Merge pull request #34 from yogeswarl/main
Added GitHub Actions to lint Python code in repositories
yogeswarl authored Jul 6, 2023
2 parents 80b1278 + e48fd16 commit 923e1da
Showing 17 changed files with 170 additions and 150 deletions.
21 changes: 21 additions & 0 deletions .github/workflows/testing.yml
@@ -0,0 +1,21 @@
name: testing
on: [push]
env:
  APPLICATION_NAME : WORKFLOW
jobs:
  lint:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v3
      - name: Setup Python environment
        uses: actions/setup-python@v4
        with:
          python-version: '3.8'
      - name: Install requirements
        run : pip install --quiet --requirement testing_reqs.txt
      - name: Lint code
        run: |
          flake8 --ignore=E117,E127,E128,E231,E401,E501,E722,E701,E704,F401,F523,F841 . --exclude src/cair/,src/mdl/
          # pylint --disable=C0301 --disable=C0326 *.py
      # - name: Run unit tests
      #   run: python -m unittest --verbose --failfast
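The same check can be run locally before pushing. Below is a minimal sketch (not part of this commit) that shells out to flake8 with the exact ignore list and exclusions from the workflow's Lint code step; it assumes flake8 is installed, e.g., via the requirements.txt additions further down.

# Hedged local mirror of the CI lint step; flags copied verbatim from testing.yml.
import subprocess
import sys

def lint() -> int:
    return subprocess.run(
        ['flake8',
         '--ignore=E117,E127,E128,E231,E401,E501,E722,E701,E704,F401,F523,F841',
         '.', '--exclude', 'src/cair/,src/mdl/'],
        check=False).returncode

if __name__ == '__main__':
    sys.exit(lint())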
32 changes: 16 additions & 16 deletions output/toy.aol-ia/t5.small.local.docs.query.title/param.py
@@ -9,27 +9,27 @@
os.environ['CUDA_VISIBLE_DEVICES'] = '-1'

settings = {
    'cmd': ['agg', 'box'],# steps of pipeline, ['pair', 'finetune', 'predict', 'search', 'eval','agg', 'box']
    'cmd': ['agg', 'box'], # steps of pipeline, ['pair', 'finetune', 'predict', 'search', 'eval','agg', 'box']
    'ncore': multiprocessing.cpu_count(),
    't5model': 'small.local',#'base.gc', 'small.local'
    'iter': 5, #number of finetuning iteration for t5
    'nchanges': 5, #number of changes to a query
    'ranker': 'bm25', #'qld', 'bm25'
    'batch': None, #search per batch of queries for IR search using pyserini, if None, search per query
    'topk': 10, #number of retrieved documents for a query
    't5model': 'small.local', # 'base.gc', 'small.local'
    'iter': 5, # number of finetuning iteration for t5
    'nchanges': 5, # number of changes to a query
    'ranker': 'bm25', # 'qld', 'bm25'
    'batch': None, # search per batch of queries for IR search using pyserini, if None, search per query
    'topk': 10, # number of retrieved documents for a query
    'metric': 'map', # any valid trec_eval.9.0.4 metric like map, ndcg, recip_rank, ...
    'treclib': f'"./trec_eval.9.0.4/trec_eval{extension}"',#in non-windows, remove .exe, also for pytrec_eval, 'pytrec'
    'treclib': f'"./trec_eval.9.0.4/trec_eval{extension}"', # in non-windows, remove .exe, also for pytrec_eval, 'pytrec'
    'msmarco.passage': {
        'index_item': ['passage'],
        'index': '../data/raw/msmarco.passage/lucene-index.msmarco-v1-passage.20220131.9ea315/',
        'pairing': [None, 'docs', 'query'], #[context={msmarco does not have userinfo}, input={query, doc, doc(s)}, output={query, doc, doc(s)}], s means concat of docs
        'lseq':{"inputs": 32, "targets": 256}, #query length and doc length for t5 model,
        'pairing': [None, 'docs', 'query'], # [context={msmarco does not have userinfo}, input={query, doc, doc(s)}, output={query, doc, doc(s)}], s means concat of docs
        'lseq':{"inputs": 32, "targets": 256}, # query length and doc length for t5 model,
    },
    'aol-ia': {
        'index_item': ['title'], # ['url'], ['title', 'url'], ['title', 'url', 'text']
        'index': f'../data/raw/aol-ia/lucene-index/title/',
        'pairing': [None, 'docs', 'query'], #[context={2 scenarios, one with userID and one without userID). input={'userid','query','doc(s)'} output={'query','doc(s)'}
        'lseq':{"inputs": 32, "targets": 256}, #query length and doc length for t5 model,
        'filter': {'minql': 1, 'mindocl': 10}# [min query length, min doc length], after merge queries with relevant 'index_item', if |query| <= minql drop the row, if |'index_item'| < mindocl, drop row
        'index_item': ['title'], # ['url'], ['title', 'url'], ['title', 'url', 'text']
        'index': '../data/raw/aol-ia/lucene-index/title/',
        'pairing': [None, 'docs', 'query'], # [context={2 scenarios, one with userID and one without userID). input={'userid','query','doc(s)'} output={'query','doc(s)'}
        'lseq':{"inputs": 32, "targets": 256}, # query length and doc length for t5 model,
        'filter': {'minql': 1, 'mindocl': 10} # [min query length, min doc length], after merge queries with relevant 'index_item', if |query| <= minql drop the row, if |'index_item'| < mindocl, drop row
    }
}
}
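For orientation, here is a small hedged sketch (not in the diff) of how a driver might read this settings dict for the aol-ia corpus; the variable names are illustrative only.

# Illustrative only: pick the per-corpus block and a few of its knobs.
corpus = 'aol-ia'
index_dir = settings[corpus]['index']            # '../data/raw/aol-ia/lucene-index/title/'
minql = settings[corpus]['filter']['minql']      # rows with |query| <= 1 are dropped
mindocl = settings[corpus]['filter']['mindocl']  # rows with |doc| < 10 are dropped
ranker, topk = settings['ranker'], settings['topk']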
@@ -18,15 +18,15 @@
    'batch': 100, # search per batch of queries for IR search using pyserini, if None, search per query
    'topk': 10, # number of retrieved documents for a query
    'metric': 'map', # any valid trec_eval.9.0.4 metric like map, ndcg, recip_rank, ...
    'treclib': f'"./trec_eval.9.0.4/trec_eval{extension}"',#in non-windows, remove .exe, also for pytrec_eval, 'pytrec'
    'treclib': f'"./trec_eval.9.0.4/trec_eval{extension}"', # in non-windows, remove .exe, also for pytrec_eval, 'pytrec'
    'msmarco.passage': {
        'index_item': ['passage'],
        'index': '../data/raw/msmarco.passage/lucene-index.msmarco-v1-passage.20220131.9ea315/',
        'pairing': [None, 'docs', 'query'], # [context={msmarco does not have userinfo}, input={query, doc, doc(s)}, output={query, doc, doc(s)}], s means concat of docs
        'lseq': {"inputs": 32, "targets": 256}, # query length and doc length for t5 model,
    },
    'aol-ia': {
        'index_item': ['title'], # ['url'], ['title', 'url'], ['title', 'url', 'text']
        'index_item': ['title'], # ['url'], ['title', 'url'], ['title', 'url', 'text']
        'index': '../data/raw/aol-ia/lucene-index/title/',
        'pairing': [None, 'docs', 'query'], # [context={2 scenarios, one with userID and one without userID). input={'userid','query','doc(s)'} output={'query','doc(s)'}
        'lseq': {"inputs": 32, "targets": 256}, # query length and doc length for t5 model,
2 changes: 2 additions & 0 deletions requirements.txt
@@ -10,3 +10,5 @@ pandas
pyserini
ir_datasets
filesplit
flake8
pylint
3 changes: 2 additions & 1 deletion src/cmn/lucenex.py
@@ -1,5 +1,6 @@
import sys, subprocess, os


def lucenex(corpus, output, ncore):
"""
common code to create index using the subprocess module
Expand All @@ -13,4 +14,4 @@ def lucenex(corpus, output, ncore):
'--index', output,
'--generator', 'DefaultLuceneDocumentGenerator',
'--threads', str(ncore), '--storePositions', '--storeDocvectors', '--storeRaw', '--optimize'])
print(f'Finished creating index.')
print('Finished creating index.')
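A hedged usage sketch for lucenex (not part of the diff): the index path and core count mirror the aol-ia settings in param.py, while the import path and the jsonl collection directory are assumptions.

# Hypothetical call: index a jsonl collection of AOL-IA titles with all available cores.
import multiprocessing
from cmn.lucenex import lucenex  # assumed import path for src/cmn/lucenex.py

lucenex(corpus='../data/raw/aol-ia/title/',               # assumed jsonl collection dir
        output='../data/raw/aol-ia/lucene-index/title/',  # index path from param.py
        ncore=multiprocessing.cpu_count())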
6 changes: 3 additions & 3 deletions src/cmn/refiner.py
@@ -1,14 +1,14 @@
import pandas as pd

#creates a train_test_split using pandas
# creates a train_test_split using pandas
datasets = ['diamond', 'platinum', 'gold']

def train_test_split(input,train_split = 0.8):

def train_test_split(input,train_split=0.8):
    for ds in datasets:
        refiner_ds = pd.read_csv(f'{input}/{ds}.tsv', sep='\t', encoding='utf-8', names=['qid', 'query', 'map', 'query_', 'map_'])
        train = refiner_ds.sample(frac=train_split, random_state=200)
        test = refiner_ds.drop(train.index)
        train.to_csv(f'{input}/{ds}.train.tsv', sep='\t', index=False, header=False, columns=['query', 'query_'])
        test.to_csv(f'{input}/{ds}.test.tsv', sep='\t', index=False, header=False, columns=['query', 'query_'])
        print(f'saving {ds} with {train_split * 100}% train split and {int(1 - train_split) * 100}% test split at {input} ')
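A hedged usage sketch for train_test_split (not in the commit); the input directory is hypothetical, following the output/toy.aol-ia layout seen elsewhere in this diff, and the import path is assumed.

# Hypothetical: writes {diamond,platinum,gold}.{train,test}.tsv next to the source tsv files.
from cmn.refiner import train_test_split  # assumed import path for src/cmn/refiner.py

train_test_split('../output/toy.aol-ia/t5.small.local.docs.query.title', train_split=0.8)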

20 changes: 9 additions & 11 deletions src/dal/aol.py
@@ -2,11 +2,10 @@
from tqdm import tqdm
from shutil import copyfile
from ftfy import fix_text
tqdm.pandas()

from pyserini.search.lucene import LuceneSearcher

from dal.ds import Dataset
tqdm.pandas()


class Aol(Dataset):

@@ -16,8 +15,8 @@ def __init__(self, settings, homedir, ncore):

    @classmethod
    def _build_index(cls, homedir, index_item, indexdir, ncore):
        print(f"Creating index from scratch using ir-dataset ...")
        #https://github.com/allenai/ir_datasets
        print("Creating index from scratch using ir-dataset ...")
        # https://github.com/allenai/ir_datasets
        os.environ['IR_DATASETS_HOME'] = '/'.join(homedir.split('/')[:-1])
        if not os.path.isdir(os.environ['IR_DATASETS_HOME']): os.makedirs(os.environ['IR_DATASETS_HOME'])
        index_item_str = '.'.join(index_item)
@@ -30,12 +29,12 @@ def _build_index(cls, homedir, index_item, indexdir, ncore):
        print('Getting queries and qrels ...')
        # the column order in the file is [qid, uid, did, uid]!!!! STUPID!!
        qrels = pd.DataFrame.from_records(aolia.qrels_iter(), columns=['qid', 'did', 'rel', 'uid'], nrows=1) # namedtuple<query_id, doc_id, relevance, iteration>
        queries = pd.DataFrame.from_records(aolia.queries_iter(), columns=['qid', 'query'], nrows=1)# namedtuple<query_id, text>
        queries = pd.DataFrame.from_records(aolia.queries_iter(), columns=['qid', 'query'], nrows=1) # namedtuple<query_id, text>

        print('Creating jsonl collections for indexing ...')
        print(f'Raw documents should be downloaded already at {homedir}/aol-ia/downloaded_docs/ as explained here: https://github.com/terrierteam/aolia-tools')
        print(f'But it had bugs: https://github.com/allenai/ir_datasets/issues/222')
        print(f'Sean MacAvaney provided us with the downloaded_docs.tar file. Thanks Sean!')
        print('But it had bugs: https://github.com/allenai/ir_datasets/issues/222')
        print('Sean MacAvaney provided us with the downloaded_docs.tar file. Thanks Sean!')

        Aol.create_jsonl(aolia, index_item, f'{homedir}/{cls.user_pairing}{index_item_str}')
        if len(os.listdir(f'{indexdir}/{cls.user_pairing}{index_item_str}')) == 0:
@@ -87,7 +86,7 @@ def pair(cls, input, output, cat=True):
        # queries_qrels.drop_duplicates(subset=['qid', 'did','pid'], inplace=True) # two users with same click for same query
        if not cls.user_pairing: queries_qrels['uid'] = -1
        queries_qrels['ctx'] = ''
        queries_qrels.dropna(inplace=True) #empty doctxt, query, ...
        queries_qrels.dropna(inplace=True) # empty doctxt, query, ...
        queries_qrels.drop(queries_qrels[queries_qrels['query'].str.strip().str.len() <= Dataset.settings['filter']['minql']].index, inplace=True)
        queries_qrels.drop(queries_qrels[queries_qrels[doccol].str.strip().str.len() < Dataset.settings["filter"]['mindocl']].index, inplace=True) # remove qrels whose docs are less than mindocl
        queries_qrels.drop_duplicates(subset=['qid', 'did'], inplace=True)
@@ -100,7 +99,7 @@ def pair(cls, input, output, cat=True):
        if cls.user_pairing: qrels = pd.read_csv(f'{input}/{cls.user_pairing}qrels.train.tsv_', sep='\t', index_col=False, names=['qid', 'uid', 'did', 'rel'])
        batch_size = 1000000 # need to make this dynamic
        index_item_str = '.'.join(cls.settings['index_item'])
        ## create dirs:
        # create dirs:
        if not os.path.isdir(f'../output/aol-ia/{cls.user_pairing}t5.base.gc.docs.query.{index_item_str}/original_test_queries'): os.makedirs(f'../output/aol-ia/{cls.user_pairing}t5.base.gc.docs.query.{index_item_str}/original_test_queries')
        if not os.path.isdir(f'../output/aol-ia/{cls.user_pairing}t5.base.gc.docs.query.{index_item_str}/qrels'): os.makedirs(f'../output/aol-ia/{cls.user_pairing}t5.base.gc.docs.query.{index_item_str}/qrels')
        if len(queries_qrels) > batch_size:
@@ -112,4 +111,3 @@
                qrels_splits.to_csv(f'../output/aol-ia/{cls.user_pairing}t5.base.gc.docs.query.{index_item_str}/qrels/qrels.splits.{_}.tsv_', sep='\t',
                                    encoding='utf-8', index=False, header=False, columns=['qid', 'uid', 'did', 'rel'])
        return queries_qrels
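The batching branch above is cut off by the diff view; the following is a hedged sketch (not the repository's code) of one way queries_qrels could be cut into batch_size-row splits before the per-split qrels files are written.

# Illustrative helper: yield (split_index, chunk) pairs of at most batch_size rows each.
import pandas as pd

def iter_splits(df: pd.DataFrame, batch_size: int = 1000000):
    for start in range(0, len(df), batch_size):
        yield start // batch_size, df.iloc[start:start + batch_size]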

10 changes: 2 additions & 8 deletions src/dal/ds.py
@@ -5,6 +5,7 @@
from pyserini.search.lucene import LuceneSearcher
from pyserini.search.faiss import FaissSearcher, TctColBertQueryEncoder


class Dataset(object):
    searcher = None
    settings = None
@@ -26,7 +27,7 @@ def _txt(cls, pid):
        # The``docid`` is overloaded: if it is of type ``str``, it is treated as an external collection ``docid``;
        # if it is of type ``int``, it is treated as an internal Lucene``docid``. # stupid!!
        try:return json.loads(cls.searcher.doc(str(pid)).raw())['contents'].lower()
        except AttributeError: return '' #if Dataset.searcher.doc(str(pid)) is None
        except AttributeError: return '' # if Dataset.searcher.doc(str(pid)) is None
        except Exception as e: raise e

    @classmethod
@@ -87,7 +88,6 @@ def _docids(row):
            hits = cls.searcher.search(row.query, k=topk, remove_dups=True)
            for i, h in enumerate(hits): o.write(f'{qids[row.name]}\tQ0\t{h.docid:7}\t{i + 1:2}\t{h.score:.5f}\tPyserini\n')


        queries.progress_apply(_docids, axis=1)

    @classmethod
@@ -124,7 +124,6 @@ def aggregate(cls, original, changes, output, is_large_ds=False):
                agg_all.write(f'{row.qid}\t{change}\t{query}\t{metric_value}\n')
                if metric_value > 0 and metric_value >= row[f'original.{ranker}.{metric}']: agg_gold.write(f'{row.qid}\t{change}\t{query}\t{metric_value}\n')


    @classmethod
    def box(cls, input, qrels, output, checks):
        ranker = input.columns[-1].split('.')[0] # e.g., bm25.success.10 => bm25
@@ -153,8 +152,3 @@ def box(cls, input, qrels, output, checks):
            print(f'{c} has {df.shape[0]} queries')
            qrels = df.merge(qrels, on='qid', how='inner')
            qrels.to_csv(f'{output}/{c}.qrels.tsv', sep='\t', encoding='utf-8', index=False, header=False, columns=['qid', 'did', 'pid', 'rel'])
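The docid-overloading note in _txt above can be made concrete with a hedged sketch (not from the repo); it assumes a pyserini LuceneSearcher has already been attached to Dataset.searcher and uses a made-up pid.

# Hypothetical pid; str(pid) forces the external-collection docid lookup path described above.
import json
pid = 123456
doc = Dataset.searcher.doc(str(pid))
text = json.loads(doc.raw())['contents'].lower() if doc is not None else ''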





7 changes: 3 additions & 4 deletions src/dal/msmarco.py
@@ -2,9 +2,9 @@
from os.path import isfile,join
import pandas as pd
from tqdm import tqdm
from dal.ds import Dataset
tqdm.pandas()

from dal.ds import Dataset

class MsMarcoPsg(Dataset):

@@ -15,12 +15,11 @@ def pair(cls, input, output, cat=True):
        queries = pd.read_csv(f'{input}/queries.train.tsv', sep='\t', index_col=False, names=['qid', 'query'], converters={'query': str.lower}, header=None)
        qrels = pd.read_csv(f'{input}/qrels.train.tsv', sep='\t', index_col=False, names=['qid', 'did', 'pid', 'relevancy'], header=None)
        qrels.drop_duplicates(subset=['qid', 'pid'], inplace=True) # qrels have duplicates!!
        qrels.to_csv(f'{input}/qrels.train.tsv_', index=False, sep='\t', header=False) #trec_eval.9.0.4 does not accept duplicate rows!!
        qrels.to_csv(f'{input}/qrels.train.tsv_', index=False, sep='\t', header=False) # trec_eval.9.0.4 does not accept duplicate rows!!
        queries_qrels = pd.merge(queries, qrels, on='qid', how='inner', copy=False)
        doccol = 'docs' if cat else 'doc'
        queries_qrels[doccol] = queries_qrels['pid'].progress_apply(cls._txt) #100%|██████████| 532761/532761 [00:32<00:00, 16448.77it/s]
        queries_qrels[doccol] = queries_qrels['pid'].progress_apply(cls._txt) # 100%|██████████| 532761/532761 [00:32<00:00, 16448.77it/s]
        queries_qrels['ctx'] = ''
        if cat: queries_qrels = queries_qrels.groupby(['qid', 'query'], as_index=False, observed=True).agg({'did': list, 'pid': list, doccol: ' '.join})
        queries_qrels.to_csv(output, sep='\t', encoding='utf-8', index=False)
        return queries_qrels
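To make the cat=True aggregation above concrete, here is a toy sketch with synthetic data (not MSMARCO): the groupby collapses multiple qrels rows into one row per query with the passages concatenated.

# Toy data: two relevant passages for the same qid collapse into a single row.
import pandas as pd

toy = pd.DataFrame({'qid': [1, 1], 'query': ['q1', 'q1'],
                    'did': [10, 11], 'pid': [100, 101],
                    'docs': ['first passage text', 'second passage text']})
agg = toy.groupby(['qid', 'query'], as_index=False).agg({'did': list, 'pid': list, 'docs': ' '.join})
# agg: one row with did=[10, 11], pid=[100, 101], docs='first passage text second passage text'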
