Skip to content

Commit

Permalink
Added female bias objective function
Browse files Browse the repository at this point in the history
  • Loading branch information
Rounique committed Dec 7, 2023
1 parent 0122ac6 commit 9c9bee5
Show file tree
Hide file tree
Showing 4 changed files with 57 additions and 23 deletions.
12 changes: 6 additions & 6 deletions src/cmn/team.py
Original file line number Diff line number Diff line change
Expand Up @@ -197,12 +197,12 @@ def bucketing(bucket_size, s2i, c2i, female_indices_list, l2i, location_type, te
@classmethod
def generate_sparse_vectors(cls, datapath, output, filter, settings):
pkl = f'{output}/teamsvecs.pkl'
gender_csv = f'{output}/i2gender.csv'
df = pd.read_csv(gender_csv)
df.columns = ['Expert_Index', 'Gender']
df['Gender'] = df['Gender'].astype(bool)
female_indices = df[df['Gender'] == False]['Expert_Index']
female_indices_list = female_indices.tolist()
# gender_csv = f'{output}/i2gender.csv'
# df = pd.read_csv(gender_csv)
# df.columns = ['Expert_Index', 'Gender']
# df['Gender'] = df['Gender'].astype(bool)
# female_indices = df[df['Gender'] == False]['Expert_Index']
# female_indices_list = female_indices.tolist()

try:
st = time()
Expand Down
6 changes: 3 additions & 3 deletions src/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -139,9 +139,9 @@ def run(data_list, domain_list, fair, filter, future, augment, model_list, outpu
prep_output = f'./../data/preprocessed/{d_name}/{os.path.split(datapath)[-1]}'
vecs, indexes = d_cls.generate_sparse_vectors(datapath, f'{prep_output}{filter_str}', filter, settings['data'])
gender = pd.read_csv(f'{prep_output}{filter_str}/females.csv', index_col=None)

vecs['gender'] = lil_matrix((1, vecs['member'].shape[0]), gender)# as a single sparse vector 1 * |size of expert| whose nonzero indexes are the file indexes

female_ids = gender['opentf_index'].values.tolist()
vecs['gender'] = lil_matrix((1, vecs['member'].shape[0]))# as a single sparse vector 1 * |size of expert| whose nonzero indexes are the file indexes
vecs['gender'][:, female_ids] = 1.0
if augment:
from tqdm import tqdm
for row in tqdm(range(0, vecs['member'].shape[0])):
Expand Down
60 changes: 47 additions & 13 deletions src/mdl/fnn.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,18 +54,26 @@ def cross_entropy(self, y_, y, ns, nns, unigram):
if ns == "unigram_b": return self.ns_unigram_mini_batch(y_['pred'], y, nns)
if ns == "inverse_unigram" or ns.startswith("temporal_inverse_unigram"): return self.ns_inverse_unigram(y_['pred'], y, unigram, nns)
if ns == "inverse_unigram_b": return self.ns_inverse_unigram_mini_batch(y_['pred'], y, nns)
if ns == "female_bias": return self.female_bias(y_, y, nns)
if ns == "female_bias": return self.female_bias(y_, y)
if ns == "fair_inverse_unigram_b": return self.ns_fair_inverse_unigram_b(y_, y, nns)
# return self.weighted(y_, y)
cri = nn.BCELoss()
return cri(y_['pred'].squeeze(1), y.squeeze(1))

def female_bias(self, logits_gender, targets, pos_weight=2.5):
def female_bias(self, logits_gender, targets, pos_weight=2.5, female_weight=2):
    """Weighted binary cross-entropy with an extra reward term for female experts.

    Positives (true team members) are up-weighted by ``pos_weight``; on top of
    that, every *female* expert that is also a true member of the team gets an
    additional ``female_weight * -log(p)`` reward, biasing the model toward
    recommending female experts it would otherwise rank low.

    Args:
        logits_gender: dict with
            'pred'   -- predicted probabilities in (0, 1), shape (batch, 1, |experts|).
            'gender' -- sparse/dense 1 x |experts| indicator; nonzero columns are
                        the female expert indexes (built from females.csv in main.py
                        -- TODO confirm this is the intended encoding).
        targets: 0/1 team-membership matrix, shape (batch, 1, |experts|).
        pos_weight: multiplier on the positive log-likelihood term.
        female_weight: multiplier on the female-member reward term.

    Returns:
        Scalar tensor: the summed weighted cross-entropy over the batch.
    """
    logits = logits_gender['pred'].squeeze(1)
    targets = targets.squeeze(1)

    # Column indexes of female experts; .nonzero()[1] matches both scipy sparse
    # matrices and 2-D numpy arrays. Moved onto targets' device so this also
    # works when training on GPU (the original indexed with a CPU numpy array).
    female_idx = torch.as_tensor(logits_gender['gender'].nonzero()[1],
                                 dtype=torch.long, device=targets.device)

    # females[b, e] = 1 iff expert e is female AND a true member of team b.
    # Vectorized replacement of the original per-batch double loop, which did an
    # O(batch * |females| * |members|) scan via tensor membership tests.
    female_mask = torch.zeros(targets.shape[1], dtype=targets.dtype, device=targets.device)
    female_mask[female_idx] = 1
    females = targets * female_mask

    return (-targets * torch.log(logits) * pos_weight
            - (1 - targets) * torch.log(1 - logits)
            - female_weight * females * torch.log(logits)).sum()

def weighted(self, logits, targets, pos_weight=2.5):
targets = targets.squeeze(1)
Expand Down Expand Up @@ -145,16 +153,17 @@ def ns_inverse_unigram_mini_batch(self, logits, targets, neg_samples=5):
if idx not in cor_idx:
random_samples[b][idx] = 1
return (-targets * torch.log(logits) - random_samples * torch.log(1 - logits)).sum()

def ns_fair_inverse_unigram_b(self, logits, targets, indexes, neg_samples=5, female_weight=1):
def ns_fair_inverse_unigram_b(self, logits_gender, targets, neg_samples=5, female_weight=1):
logits = logits_gender['pred']
genders = logits_gender['gender'].nonzero()[1]
targets = targets.squeeze(1)
logits = logits.squeeze(1)

random_samples = torch.zeros_like(targets)
n_paper_per_author = torch.sum(targets, dim=0) + 1
unigram = (n_paper_per_author / (targets.shape[0] + targets.shape[1])).cpu()

females = torch.zeros_like(targets)
female_ids = indexes['female_ids']

for b in range(targets.shape[0]):
rand = torch.rand(targets.shape[1])
Expand All @@ -165,10 +174,35 @@ def ns_fair_inverse_unigram_b(self, logits, targets, indexes, neg_samples=5, fem
for idx in k_neg_idx:
if idx not in cor_idx:
random_samples[b][idx] = 1
for idx in female_ids:
females[b][idx] = 1
for idx in genders:
if idx in cor_idx:
females[b][idx] = 1
return (-targets * torch.log(logits) - random_samples * torch.log(1 - logits) - (female_weight * females * torch.log(logits))).sum()

# def ns_fair_inverse_unigram_b(self, logits, targets, indexes, neg_samples=5, female_weight=1):
# targets = targets.squeeze(1)
# logits = logits.squeeze(1)
# random_samples = torch.zeros_like(targets)
# n_paper_per_author = torch.sum(targets, dim=0) + 1
# unigram = (n_paper_per_author / (targets.shape[0] + targets.shape[1])).cpu()
#
# females = torch.zeros_like(targets)
# female_ids = indexes['female_ids']
#
# for b in range(targets.shape[0]):
# rand = torch.rand(targets.shape[1])
# neg_rands = (rand > unigram) * 1
# neg_idx = torch.nonzero(torch.tensor(neg_rands), as_tuple=True)[0]
# k_neg_idx = np.random.choice(neg_idx, neg_samples)
# cor_idx = torch.nonzero(targets[b], as_tuple=True)[0]
# for idx in k_neg_idx:
# if idx not in cor_idx:
# random_samples[b][idx] = 1
# for idx in female_ids:
# # if idx in cor_idx:
# females[b][idx] = 1
# return (-targets * torch.log(logits) - random_samples * torch.log(1 - logits) - (female_weight * females * torch.log(logits))).sum()

def learn(self, splits, indexes, vecs, params, prev_model, output):

genders = vecs['gender']#female column idx
Expand Down Expand Up @@ -261,7 +295,7 @@ def learn(self, splits, indexes, vecs, params, prev_model, output):
y_ = self.forward(X)

if loss_type == 'normal':
loss = self.cross_entropy({'pred': y_, 'gender': vecs['gender']}, y, ns, nns, unigram, indexes)
loss = self.cross_entropy({'pred': y_, 'gender': vecs['gender']}, y, ns, nns, unigram)
elif loss_type == 'SL':
loss = criterion(y_.squeeze(1), y.squeeze(1), index)
elif loss_type == 'DP':
Expand All @@ -280,9 +314,9 @@ def learn(self, splits, indexes, vecs, params, prev_model, output):
self.train(False) # Set model to valid mode
y_ = self.forward(X)
if loss_type == 'normal' or loss_type == 'DP':
loss = self.cross_entropy(y_, y, ns, nns, unigram, indexes)
loss = self.cross_entropy({'pred': y_, 'gender': vecs['gender']}, y, ns, nns, unigram)
else:
loss = criterion(y_.squeeze(), y.squeeze(), index)
loss = criterion(y_.squeeze(), y.squeeze())
valid_running_loss += loss.item()
print(
f'Fold {foldidx}/{len(splits["folds"]) - 1}, Epoch {epoch}/{num_epochs - 1}, Minibatch {batch_idx}/{int(X_train.shape[0] / batch_size)}, Phase {phase}'
Expand Down
2 changes: 1 addition & 1 deletion src/mdl/ntf.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ def evaluate(self, model_path, splits, vecs, indexes, on_train_valid_set=False,
topk = 5
topk_indices = np.argsort(Y_, axis=1)[:, -topk:]

female_ids = indexes['female_ids']
female_ids = vecs['gender'].nonzero()[1]

female_presence_count = np.sum(np.isin(topk_indices, female_ids), axis=1)
if female_presence_count.ndim == 1:
Expand Down

0 comments on commit 9c9bee5

Please sign in to comment.