Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

GeneProtein synonym conflation #185

Draft
wants to merge 21 commits into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions config.json
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,8 @@
"output_directory": "babel_outputs",

"biolink_version": "3.5.4",
"umls_version": "2023AA",
"rxnorm_version": "08072023",
"umls_version": "2023AB",
"rxnorm_version": "11062023",

"ncbi_files": ["gene2ensembl.gz", "gene_info.gz", "gene_orthologs.gz", "gene_refseq_uniprotkb_collab.gz", "mim2gene_medgen"],
"ubergraph_ontologies": ["UBERON", "CL", "GO", "NCIT", "ECO", "ECTO", "ENVO", "HP", "UPHENO","BFO","BSPO","CARO","CHEBI","CP","GOREL","IAO","MAXO","MONDO","PATO","PR","RO","UBPROP"],
Expand Down
19 changes: 0 additions & 19 deletions kubernetes/babel-private.k8s.yaml

This file was deleted.

5 changes: 0 additions & 5 deletions kubernetes/babel.k8s.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,6 @@ spec:
command: [ "/bin/bash", "-c", "--" ]
args: [ "while true; echo Running; do sleep 30; done;" ]
volumeMounts:
- mountPath: "/code/babel/input_data/private"
name: babel-private
- mountPath: "/code/babel/babel_downloads"
name: babel-downloads
- mountPath: "/code/babel/babel_outputs"
Expand All @@ -38,9 +36,6 @@ spec:
memory: "500G"
cpu: "4"
volumes:
- name: babel-private
persistentVolumeClaim:
claimName: babel-private
- name: babel-downloads
persistentVolumeClaim:
claimName: babel-downloads
Expand Down
16 changes: 15 additions & 1 deletion src/createcompendia/drugchemical.py
Original file line number Diff line number Diff line change
Expand Up @@ -273,11 +273,25 @@ def build_conflation(rxn_concord,umls_concord,pubchem_rxn_concord,drug_compendiu
x = line.strip().split('\t')
subject = x[0]
object = x[2]
#object is usually a PUBCHEM clique leader, but may instead be an RxCUI that needs mapping to its clique (handled below).

if subject in drug_rxcui_to_clique:
subject = drug_rxcui_to_clique[subject]
elif subject in chemical_rxcui_to_clique:
subject = chemical_rxcui_to_clique[subject]
else:
raise RuntimeError(f"Unknown identifier in drugchemical conflation as subject: {subject}")

if object in drug_rxcui_to_clique:
object = drug_rxcui_to_clique[object]
elif object in chemical_rxcui_to_clique:
object = chemical_rxcui_to_clique[object]
else:
logging.warning(
    f"Skipping subject-object pair ({subject}, {object}) because the object RxCUI isn't mapped to a clique"
)
continue
# raise RuntimeError(f"Unknown identifier in drugchemical conflation as object: {object}")

pairs.append((subject, object))
print("glom")
gloms = {}
Expand Down
1 change: 1 addition & 0 deletions src/createcompendia/protein.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,7 @@ def write_ensembl_ids(ensembl_dir, outfile):
dlpath = os.path.join(ensembl_dir, dl)
if os.path.isdir(dlpath):
infname = os.path.join(dlpath, 'BioMart.tsv')
print(f'write_ensembl_ids for input filename {infname}')
if os.path.exists(infname):
# open each ensembl file, find the id column, and put it in the output
with open(infname, 'r') as inf:
Expand Down
6 changes: 6 additions & 0 deletions src/datahandlers/ensembl.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,11 +12,17 @@
# just what we need.
def pull_ensembl(complete_file):
f = find_datasets()

skip_dataset_ids = {'hgfemale_gene_ensembl'}

cols = {"ensembl_gene_id", "ensembl_peptide_id", "description", "external_gene_name", "external_gene_source",
"external_synonym", "chromosome_name", "source", "gene_biotype", "entrezgene_id", "zfin_id_id", 'mgi_id',
'rgd_id', 'flybase_gene_id', 'sgd_gene', 'wormbase_gene'}
for ds in f['Dataset_ID']:
print(ds)
if ds in skip_dataset_ids:
print(f'Skipping {ds} as it is included in skip_dataset_ids: {skip_dataset_ids}')
continue
outfile = make_local_name('BioMart.tsv', subpath=f'ENSEMBL/{ds}')
# Really, we should let snakemake handle this, but then we would need to put a list of all the 200+ sets in our
# config, and keep it up to date. Maybe you could have a job that gets the datasets and writes a dataset file,
Expand Down
12 changes: 9 additions & 3 deletions src/datahandlers/hgnc.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,15 @@
from src.babel_utils import make_local_name, pull_via_ftp
from src.babel_utils import make_local_name, pull_via_urllib
import json

def pull_hgnc():
outfile='HGNC/hgnc_complete_set.json'
pull_via_ftp('ftp.ebi.ac.uk', '/pub/databases/genenames/new/json', 'hgnc_complete_set.json',outfilename=outfile)
# On 2023nov26, downloading this file over FTP from Python failed with an error (although,
# weirdly enough, the same file downloaded without any problem using macOS Finder), so this
# was changed to use HTTPS instead.
pull_via_urllib(
'https://ftp.ebi.ac.uk/pub/databases/genenames/new/json/',
'hgnc_complete_set.json',
decompress=False,
subpath="HGNC")

def pull_hgnc_labels_and_synonyms(infile):
with open(infile,'r') as data:
Expand Down
7 changes: 0 additions & 7 deletions src/datahandlers/uniprotkb.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,5 @@
from src.babel_utils import pull_via_urllib, make_local_name

def pull_one_uniprotkb(which):
pull_via_urllib('ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/',f'uniprot_{which}.fasta.gz',subpath='UniProtKB')

def readlabels(which):
swissname = make_local_name(f'UniProtKB/uniprot_{which}.fasta')
Expand All @@ -17,11 +15,6 @@ def readlabels(which):
swissprot_labels[uniprotid] = f'{name} ({which})'
return swissprot_labels

def pull_uniprotkb():
pull_via_urllib('https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/idmapping/',f'idmapping.dat.gz',subpath='UniProtKB')
for which in ['sprot','trembl']:
pull_via_urllib('https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/',f'uniprot_{which}.fasta.gz',subpath='UniProtKB')

def pull_uniprot_labels(sprotfile,tremblfile,fname):
slabels = readlabels('sprot')
tlabels = readlabels('trembl')
Expand Down
21 changes: 14 additions & 7 deletions src/snakefiles/datacollect.snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -91,13 +91,20 @@ rule get_mods_labels:

### UniProtKB

rule get_uniprotkb:
rule get_uniprotkb_idmapping:
output:
config['download_directory']+'/UniProtKB/uniprot_sprot.fasta',
config['download_directory']+'/UniProtKB/uniprot_trembl.fasta',
config['download_directory']+'/UniProtKB/idmapping.dat'
run:
uniprotkb.pull_uniprotkb()
idmapping = config['download_directory']+'/UniProtKB/idmapping.dat'
shell: """wget --continue --tries=10 "https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/idmapping/idmapping.dat.gz" -O {output.idmapping}.gz && gunzip -k {output.idmapping}.gz"""

rule get_uniprotkb_sprot:
output:
uniprot_sprot = config['download_directory']+'/UniProtKB/uniprot_sprot.fasta'
shell: """wget --continue --tries=10 "https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot.fasta.gz" -O {output.uniprot_sprot}.gz && gunzip -k {output.uniprot_sprot}.gz"""

rule get_uniprotkb_trembl:
output:
uniprot_trembl = config['download_directory']+'/UniProtKB/uniprot_trembl.fasta'
shell: """wget --continue --tries=10 "https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_trembl.fasta.gz" -O {output.uniprot_trembl}.gz && gunzip -k {output.uniprot_trembl}.gz"""

rule get_uniprotkb_labels:
input:
Expand Down Expand Up @@ -361,7 +368,7 @@ rule get_panther_pathways:
output:
outfile = config['download_directory'] + '/PANTHER.PATHWAY/SequenceAssociationPathway3.6.7.txt'
run:
pantherpathways.pull_panther_pathways(output.outfile)
pantherpathways.pull_panther_pathways()

rule get_panther_pathway_labels:
input:
Expand Down
13 changes: 12 additions & 1 deletion src/snakefiles/geneprotein.snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -21,9 +21,20 @@ rule geneprotein_conflation:
run:
geneprotein.build_conflation(input.geneprotein_concord,input.gene_compendium,input.protein_compendium,output.outfile)

rule geneprotein_conflated_synonyms:
input:
geneprotein_conflations=[config['output_directory']+'/conflation/GeneProtein.txt'],
gene_outputs=expand("{od}/synonyms/{ap}", od = config['output_directory'], ap = config['gene_outputs']),
protein_outputs=expand("{od}/synonyms/{ap}", od = config['output_directory'], ap = config['protein_outputs']),
output:
geneprotein_conflated_synonyms=config['output_directory']+'/synonyms/GeneProteinConflated.txt'
run:
synonymconflation.conflate_synonyms(input.gene_outputs + input.protein_outputs, input.geneprotein_conflations, output.geneprotein_conflated_synonyms)

rule geneprotein:
input:
config['output_directory']+'/conflation/GeneProtein.txt'
config['output_directory']+'/conflation/GeneProtein.txt',
config['output_directory']+'/synonyms/GeneProteinConflated.txt'
output:
x=config['output_directory']+'/reports/geneprotein_done'
shell:
Expand Down