From 5eaaaecf6794be79e26e3efd92cca66098ac201a Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Fri, 13 Oct 2023 15:35:41 -0400 Subject: [PATCH 01/21] Add object normalization for PubChem IDs in drugchemical conflation. --- src/createcompendia/drugchemical.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/src/createcompendia/drugchemical.py b/src/createcompendia/drugchemical.py index 1068a042..9976e71c 100644 --- a/src/createcompendia/drugchemical.py +++ b/src/createcompendia/drugchemical.py @@ -273,11 +273,21 @@ def build_conflation(rxn_concord,umls_concord,pubchem_rxn_concord,drug_compendiu x = line.strip().split('\t') subject = x[0] object = x[2] - #object is a PUBCHEM. It's by definition a clique_leader. + if subject in drug_rxcui_to_clique: subject = drug_rxcui_to_clique[subject] elif subject in chemical_rxcui_to_clique: subject = chemical_rxcui_to_clique[subject] + else: + raise RuntimeError(f"Unknown identifier in drugchemical conflation as subject: {subject}") + + if object in drug_rxcui_to_clique: + object = drug_rxcui_to_clique[object] + elif object in chemical_rxcui_to_clique: + object = chemical_rxcui_to_clique[object] + else: + raise RuntimeError(f"Unknown identifier in drugchemical conflation as object: {object}") + pairs.append((subject, object)) print("glom") gloms = {} From 4bd38177750704a6021666dd074fbcc1fa4a6e5f Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Sat, 14 Oct 2023 15:48:59 -0400 Subject: [PATCH 02/21] Replaced exceptions with warnings. --- src/createcompendia/drugchemical.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/src/createcompendia/drugchemical.py b/src/createcompendia/drugchemical.py index 9976e71c..7a9c0fa1 100644 --- a/src/createcompendia/drugchemical.py +++ b/src/createcompendia/drugchemical.py @@ -279,14 +279,22 @@ def build_conflation(rxn_concord,umls_concord,pubchem_rxn_concord,drug_compendiu elif subject in chemical_rxcui_to_clique: subject = chemical_rxcui_to_clique[subject] else: - raise RuntimeError(f"Unknown identifier in drugchemical conflation as subject: {subject}") + logging.warning( + f"Skipping subject-object pair ({subject}, {object}) because the subject isn't mapped to a RxCUI" + ) + continue + # raise RuntimeError(f"Unknown identifier in drugchemical conflation as subject: {subject}") if object in drug_rxcui_to_clique: object = drug_rxcui_to_clique[object] elif object in chemical_rxcui_to_clique: object = chemical_rxcui_to_clique[object] else: - raise RuntimeError(f"Unknown identifier in drugchemical conflation as object: {object}") + logging.warning( + f"Skipping subject-object pair ({subject}, {object}) because the object isn't mapped to a RxCUI" + ) + continue + # raise RuntimeError(f"Unknown identifier in drugchemical conflation as object: {object}") pairs.append((subject, object)) print("glom") From 122c5166f903b06608b225d57e618e3c288cccf9 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Sat, 14 Oct 2023 16:04:03 -0400 Subject: [PATCH 03/21] Reverted one of the warnings back to an exception. --- src/createcompendia/drugchemical.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/src/createcompendia/drugchemical.py b/src/createcompendia/drugchemical.py index 7a9c0fa1..0cf68175 100644 --- a/src/createcompendia/drugchemical.py +++ b/src/createcompendia/drugchemical.py @@ -279,11 +279,7 @@ def build_conflation(rxn_concord,umls_concord,pubchem_rxn_concord,drug_compendiu elif subject in chemical_rxcui_to_clique: subject = chemical_rxcui_to_clique[subject] else: - logging.warning( - f"Skipping subject-object pair ({subject}, {object}) because the subject isn't mapped to a RxCUI" - ) - continue - # raise RuntimeError(f"Unknown identifier in drugchemical conflation as subject: {subject}") + raise RuntimeError(f"Unknown identifier in drugchemical conflation as subject: {subject}") if object in drug_rxcui_to_clique: object = drug_rxcui_to_clique[object] From f5e93c7030ac9b197d252498148200bddf3ba61c Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Fri, 3 Nov 2023 02:07:27 -0400 Subject: [PATCH 04/21] Prevent and log KeyError in anatomy. --- src/createcompendia/anatomy.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/createcompendia/anatomy.py b/src/createcompendia/anatomy.py index 4edd2d0a..02724cae 100644 --- a/src/createcompendia/anatomy.py +++ b/src/createcompendia/anatomy.py @@ -1,3 +1,4 @@ +import logging from collections import defaultdict import src.datahandlers.obo as obo @@ -142,6 +143,9 @@ def create_typed_sets(eqsets,types): found = False for prefix in [GO, CL, UBERON]: if prefix in prefixes and not found: + if prefixes[prefix][0] not in types: + logging.warning(f"Could not find prefix {prefixes[prefix][0]} in {types}, skipping.") + continue mytype = types[prefixes[prefix][0]] typed_sets[mytype].add(equivalent_ids) found = True From 73bb820baf09548badece65d0d25dd928d94dd11 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Fri, 3 Nov 2023 02:11:26 -0400 Subject: [PATCH 05/21] Reverted anatomy hiding code. --- src/createcompendia/anatomy.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/createcompendia/anatomy.py b/src/createcompendia/anatomy.py index 02724cae..21dfed12 100644 --- a/src/createcompendia/anatomy.py +++ b/src/createcompendia/anatomy.py @@ -143,9 +143,6 @@ def create_typed_sets(eqsets,types): found = False for prefix in [GO, CL, UBERON]: if prefix in prefixes and not found: - if prefixes[prefix][0] not in types: - logging.warning(f"Could not find prefix {prefixes[prefix][0]} in {types}, skipping.") - continue mytype = types[prefixes[prefix][0]] typed_sets[mytype].add(equivalent_ids) found = True From d295a942531f04bb0a69792ce28723e0488504eb Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Fri, 3 Nov 2023 13:39:10 -0400 Subject: [PATCH 06/21] Moved UniProtKB downloads into Snakefile as wget commands. --- src/datahandlers/uniprotkb.py | 7 ------- src/snakefiles/datacollect.snakefile | 19 +++++++++++++------ 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/src/datahandlers/uniprotkb.py b/src/datahandlers/uniprotkb.py index 461356e1..d4c95eea 100644 --- a/src/datahandlers/uniprotkb.py +++ b/src/datahandlers/uniprotkb.py @@ -1,7 +1,5 @@ from src.babel_utils import pull_via_urllib, make_local_name -def pull_one_uniprotkb(which): - pull_via_urllib('ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/',f'uniprot_{which}.fasta.gz',subpath='UniProtKB') def readlabels(which): swissname = make_local_name(f'UniProtKB/uniprot_{which}.fasta') @@ -17,11 +15,6 @@ def readlabels(which): swissprot_labels[uniprotid] = f'{name} ({which})' return swissprot_labels -def pull_uniprotkb(): - pull_via_urllib('https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/idmapping/',f'idmapping.dat.gz',subpath='UniProtKB') - for which in ['sprot','trembl']: - pull_via_urllib('https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/',f'uniprot_{which}.fasta.gz',subpath='UniProtKB') - def pull_uniprot_labels(sprotfile,tremblfile,fname): slabels = readlabels('sprot') tlabels = readlabels('trembl') diff --git a/src/snakefiles/datacollect.snakefile b/src/snakefiles/datacollect.snakefile index f9b32534..9dba1ad1 100644 --- a/src/snakefiles/datacollect.snakefile +++ b/src/snakefiles/datacollect.snakefile @@ -91,13 +91,20 @@ rule get_mods_labels: ### UniProtKB -rule get_uniprotkb: +rule get_uniprotkb_idmapping: output: - config['download_directory']+'/UniProtKB/uniprot_sprot.fasta', - config['download_directory']+'/UniProtKB/uniprot_trembl.fasta', - config['download_directory']+'/UniProtKB/idmapping.dat' - run: - uniprotkb.pull_uniprotkb() + idmapping = config['download_directory']+'/UniProtKB/idmapping.dat' + shell: """wget --continue --tries=10 "https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/idmapping/idmapping.dat.gz" -O {output.idmapping}.gz && gunzip {output.idmapping}.gz""" + +rule get_uniprotkb_sprot: + output: + uniprot_sprot = config['download_directory']+'/UniProtKB/uniprot_sprot.fasta' + shell: """wget --continue --tries=10 "https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot.fasta.gz" -O {output.uniprot_sprot}.gz && gunzip {output.uniprot_sprot}.gz""" + +rule get_uniprotkb_trembl: + output: + uniprot_trembl = config['download_directory']+'/UniProtKB/uniprot_trembl.fasta' + shell: """wget --continue --tries=10 "https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_trembl.fasta.gz" -O {output.uniprot_trembl}.gz && gunzip {output.uniprot_trembl}.gz""" rule get_uniprotkb_labels: input: From c99f4ff42daf242b4d2ae2c389e6516cf332abb9 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Fri, 3 Nov 2023 15:09:37 -0400 Subject: [PATCH 07/21] Improved progress output so it can be mixed with other logs better. --- src/snakefiles/datacollect.snakefile | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/snakefiles/datacollect.snakefile b/src/snakefiles/datacollect.snakefile index 9dba1ad1..f822f18c 100644 --- a/src/snakefiles/datacollect.snakefile +++ b/src/snakefiles/datacollect.snakefile @@ -94,17 +94,17 @@ rule get_mods_labels: rule get_uniprotkb_idmapping: output: idmapping = config['download_directory']+'/UniProtKB/idmapping.dat' - shell: """wget --continue --tries=10 "https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/idmapping/idmapping.dat.gz" -O {output.idmapping}.gz && gunzip {output.idmapping}.gz""" + shell: """wget --continue --progress=dot --tries=10 "https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/idmapping/idmapping.dat.gz" -O {output.idmapping}.gz && gunzip {output.idmapping}.gz""" rule get_uniprotkb_sprot: output: uniprot_sprot = config['download_directory']+'/UniProtKB/uniprot_sprot.fasta' - shell: """wget --continue --tries=10 "https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot.fasta.gz" -O {output.uniprot_sprot}.gz && gunzip {output.uniprot_sprot}.gz""" + shell: """wget --continue --progress=dot --tries=10 "https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot.fasta.gz" -O {output.uniprot_sprot}.gz && gunzip {output.uniprot_sprot}.gz""" rule get_uniprotkb_trembl: output: uniprot_trembl = config['download_directory']+'/UniProtKB/uniprot_trembl.fasta' - shell: """wget --continue --tries=10 "https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_trembl.fasta.gz" -O {output.uniprot_trembl}.gz && gunzip {output.uniprot_trembl}.gz""" + shell: """wget --continue --progress=dot --tries=10 "https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_trembl.fasta.gz" -O {output.uniprot_trembl}.gz && gunzip {output.uniprot_trembl}.gz""" rule get_uniprotkb_labels: input: From e884477fab9b8c169c1d6eaba5f05d510c05bb0f Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Sun, 5 Nov 2023 02:51:16 -0500 Subject: [PATCH 08/21] Deleted redundant output filename from pull_panther_pathways(). --- src/snakefiles/datacollect.snakefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/snakefiles/datacollect.snakefile b/src/snakefiles/datacollect.snakefile index f822f18c..36ad5afb 100644 --- a/src/snakefiles/datacollect.snakefile +++ b/src/snakefiles/datacollect.snakefile @@ -368,7 +368,7 @@ rule get_panther_pathways: output: outfile = config['download_directory'] + '/PANTHER.PATHWAY/SequenceAssociationPathway3.6.7.txt' run: - pantherpathways.pull_panther_pathways(output.outfile) + pantherpathways.pull_panther_pathways() rule get_panther_pathway_labels: input: From 61b3afa0f927b7808ae803acb109f2c89be1476a Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Sun, 5 Nov 2023 02:56:50 -0500 Subject: [PATCH 09/21] Took out --progress=dot as this somehow makes it even harder to read. --- src/snakefiles/datacollect.snakefile | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/snakefiles/datacollect.snakefile b/src/snakefiles/datacollect.snakefile index 36ad5afb..1c34da59 100644 --- a/src/snakefiles/datacollect.snakefile +++ b/src/snakefiles/datacollect.snakefile @@ -94,17 +94,17 @@ rule get_mods_labels: rule get_uniprotkb_idmapping: output: idmapping = config['download_directory']+'/UniProtKB/idmapping.dat' - shell: """wget --continue --progress=dot --tries=10 "https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/idmapping/idmapping.dat.gz" -O {output.idmapping}.gz && gunzip {output.idmapping}.gz""" + shell: """wget --continue --tries=10 "https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/idmapping/idmapping.dat.gz" -O {output.idmapping}.gz && gunzip {output.idmapping}.gz""" rule get_uniprotkb_sprot: output: uniprot_sprot = config['download_directory']+'/UniProtKB/uniprot_sprot.fasta' - shell: """wget --continue --progress=dot --tries=10 "https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot.fasta.gz" -O {output.uniprot_sprot}.gz && gunzip {output.uniprot_sprot}.gz""" + shell: """wget --continue --tries=10 "https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot.fasta.gz" -O {output.uniprot_sprot}.gz && gunzip {output.uniprot_sprot}.gz""" rule get_uniprotkb_trembl: output: uniprot_trembl = config['download_directory']+'/UniProtKB/uniprot_trembl.fasta' - shell: """wget --continue --progress=dot --tries=10 "https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_trembl.fasta.gz" -O {output.uniprot_trembl}.gz && gunzip {output.uniprot_trembl}.gz""" + shell: """wget --continue --tries=10 "https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_trembl.fasta.gz" -O {output.uniprot_trembl}.gz && gunzip {output.uniprot_trembl}.gz""" rule get_uniprotkb_labels: input: From d7d4d6aa34d74b5c09917733e0c903a98723847e Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Sun, 5 Nov 2023 03:00:57 -0500 Subject: [PATCH 10/21] Added `-k` to gunzip so we keep the gzip file in case it's needed. --- src/snakefiles/datacollect.snakefile | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/snakefiles/datacollect.snakefile b/src/snakefiles/datacollect.snakefile index 1c34da59..563d934a 100644 --- a/src/snakefiles/datacollect.snakefile +++ b/src/snakefiles/datacollect.snakefile @@ -94,17 +94,17 @@ rule get_mods_labels: rule get_uniprotkb_idmapping: output: idmapping = config['download_directory']+'/UniProtKB/idmapping.dat' - shell: """wget --continue --tries=10 "https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/idmapping/idmapping.dat.gz" -O {output.idmapping}.gz && gunzip {output.idmapping}.gz""" + shell: """wget --continue --tries=10 "https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/idmapping/idmapping.dat.gz" -O {output.idmapping}.gz && gunzip -k {output.idmapping}.gz""" rule get_uniprotkb_sprot: output: uniprot_sprot = config['download_directory']+'/UniProtKB/uniprot_sprot.fasta' - shell: """wget --continue --tries=10 "https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot.fasta.gz" -O {output.uniprot_sprot}.gz && gunzip {output.uniprot_sprot}.gz""" + shell: """wget --continue --tries=10 "https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot.fasta.gz" -O {output.uniprot_sprot}.gz && gunzip -k {output.uniprot_sprot}.gz""" rule get_uniprotkb_trembl: output: uniprot_trembl = config['download_directory']+'/UniProtKB/uniprot_trembl.fasta' - shell: """wget --continue --tries=10 "https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_trembl.fasta.gz" -O {output.uniprot_trembl}.gz && gunzip {output.uniprot_trembl}.gz""" + shell: """wget --continue --tries=10 "https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_trembl.fasta.gz" -O {output.uniprot_trembl}.gz && gunzip -k {output.uniprot_trembl}.gz""" rule get_uniprotkb_labels: input: From c0315565d65ad7b43ab4a46860bdbbce11a472a7 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Sun, 19 Nov 2023 22:52:54 -0500 Subject: [PATCH 11/21] Removed babel-private. --- kubernetes/babel-private.k8s.yaml | 19 ------------------- kubernetes/babel.k8s.yaml | 5 ----- 2 files changed, 24 deletions(-) delete mode 100644 kubernetes/babel-private.k8s.yaml diff --git a/kubernetes/babel-private.k8s.yaml b/kubernetes/babel-private.k8s.yaml deleted file mode 100644 index 41926028..00000000 --- a/kubernetes/babel-private.k8s.yaml +++ /dev/null @@ -1,19 +0,0 @@ -# Kubernetes file for setting up a private volume to use for Babel. -# -# This private volume is only needed to store some "private" data, such -# as UMLS files, that should not be included in the Docker image. -# The private volume only needs to be ~5Gi in size. - -apiVersion: v1 -kind: PersistentVolumeClaim -metadata: - name: babel-private - labels: - app: babel -spec: - accessModes: - - ReadWriteOnce - resources: - requests: - storage: 10Gi - storageClassName: basic diff --git a/kubernetes/babel.k8s.yaml b/kubernetes/babel.k8s.yaml index efce9715..eb4050fd 100644 --- a/kubernetes/babel.k8s.yaml +++ b/kubernetes/babel.k8s.yaml @@ -24,8 +24,6 @@ spec: command: [ "/bin/bash", "-c", "--" ] args: [ "while true; echo Running; do sleep 30; done;" ] volumeMounts: - - mountPath: "/code/babel/input_data/private" - name: babel-private - mountPath: "/code/babel/babel_downloads" name: babel-downloads - mountPath: "/code/babel/babel_outputs" @@ -38,9 +36,6 @@ spec: memory: "500G" cpu: "4" volumes: - - name: babel-private - persistentVolumeClaim: - claimName: babel-private - name: babel-downloads persistentVolumeClaim: claimName: babel-downloads From 2d89a01c3533ec0d943524dd588b075d0d5249a7 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Sun, 19 Nov 2023 22:56:56 -0500 Subject: [PATCH 12/21] Upgraded UMLS and RxNorm versions. --- config.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/config.json b/config.json index ad38c949..489232be 100644 --- a/config.json +++ b/config.json @@ -5,8 +5,8 @@ "output_directory": "babel_outputs", "biolink_version": "3.5.4", - "umls_version": "2023AA", - "rxnorm_version": "08072023", + "umls_version": "2023AB", + "rxnorm_version": "11062023", "ncbi_files": ["gene2ensembl.gz", "gene_info.gz", "gene_orthologs.gz", "gene_refseq_uniprotkb_collab.gz", "mim2gene_medgen"], "ubergraph_ontologies": ["UBERON", "CL", "GO", "NCIT", "ECO", "ECTO", "ENVO", "HP", "UPHENO","BFO","BSPO","CARO","CHEBI","CP","GOREL","IAO","MAXO","MONDO","PATO","PR","RO","UBPROP"], From a85fb42f0378ea45d0399a53793f6d6f9c95122a Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Sun, 26 Nov 2023 18:14:12 -0500 Subject: [PATCH 13/21] Switched HGNC to HTTP from FTP. The FTP server seems to be having issues right now. --- src/datahandlers/hgnc.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/datahandlers/hgnc.py b/src/datahandlers/hgnc.py index 23fdd265..bc1a247a 100644 --- a/src/datahandlers/hgnc.py +++ b/src/datahandlers/hgnc.py @@ -1,9 +1,9 @@ -from src.babel_utils import make_local_name, pull_via_ftp +from src.babel_utils import make_local_name, pull_via_ftp, pull_via_urllib import json def pull_hgnc(): outfile='HGNC/hgnc_complete_set.json' - pull_via_ftp('ftp.ebi.ac.uk', '/pub/databases/genenames/new/json', 'hgnc_complete_set.json',outfilename=outfile) + pull_via_urllib('ftp.ebi.ac.uk', '/pub/databases/genenames/new/json', 'hgnc_complete_set.json',outfilename=outfile) def pull_hgnc_labels_and_synonyms(infile): with open(infile,'r') as data: From da88e7a5ef285da5569dbf258b5c50b5e881f5cf Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Sun, 26 Nov 2023 18:34:08 -0500 Subject: [PATCH 14/21] Fixed FTP -> HTTP change for HGNC. --- src/datahandlers/hgnc.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/src/datahandlers/hgnc.py b/src/datahandlers/hgnc.py index bc1a247a..1776ee6d 100644 --- a/src/datahandlers/hgnc.py +++ b/src/datahandlers/hgnc.py @@ -1,9 +1,15 @@ -from src.babel_utils import make_local_name, pull_via_ftp, pull_via_urllib +from src.babel_utils import make_local_name, pull_via_urllib import json def pull_hgnc(): - outfile='HGNC/hgnc_complete_set.json' - pull_via_urllib('ftp.ebi.ac.uk', '/pub/databases/genenames/new/json', 'hgnc_complete_set.json',outfilename=outfile) + # On 2023nov26, I would get an error trying to download this file using FTP on Python (although + # weirdly enough, I could download the file without any problem using macOS Finder). So I changed + # it to use HTTP instead. + pull_via_urllib( + 'https://ftp.ebi.ac.uk/pub/databases/genenames/new/json/', + 'hgnc_complete_set.json', + decompress=False, + subpath="HGNC") def pull_hgnc_labels_and_synonyms(infile): with open(infile,'r') as data: From 0c677be610e179d8465e50c57abcb4dbd4455256 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Thu, 30 Nov 2023 00:30:18 -0500 Subject: [PATCH 15/21] Added code to skip hgfemale_gene_ensembl. --- src/datahandlers/ensembl.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/datahandlers/ensembl.py b/src/datahandlers/ensembl.py index 04b67e78..804284c3 100644 --- a/src/datahandlers/ensembl.py +++ b/src/datahandlers/ensembl.py @@ -12,11 +12,17 @@ # just what we need. def pull_ensembl(complete_file): f = find_datasets() + + skip_dataset_ids = {'hgfemale_gene_ensembl'} + cols = {"ensembl_gene_id", "ensembl_peptide_id", "description", "external_gene_name", "external_gene_source", "external_synonym", "chromosome_name", "source", "gene_biotype", "entrezgene_id", "zfin_id_id", 'mgi_id', 'rgd_id', 'flybase_gene_id', 'sgd_gene', 'wormbase_gene'} for ds in f['Dataset_ID']: print(ds) + if ds in skip_dataset_ids: + print(f'Skipping {ds} as it is included in skip_dataset_ids: {skip_dataset_ids}') + continue outfile = make_local_name('BioMart.tsv', subpath=f'ENSEMBL/{ds}') # Really, we should let snakemake handle this, but then we would need to put a list of all the 200+ sets in our # config, and keep it up to date. Maybe you could have a job that gets the datasets and writes a dataset file, From 8fb47d4ef52d2b41506b2e8851dccb5f85df88fa Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Thu, 30 Nov 2023 22:21:08 -0500 Subject: [PATCH 16/21] Improved debugging. --- src/createcompendia/protein.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/createcompendia/protein.py b/src/createcompendia/protein.py index 05fc705d..06bc6fbd 100644 --- a/src/createcompendia/protein.py +++ b/src/createcompendia/protein.py @@ -65,6 +65,7 @@ def write_ensembl_ids(ensembl_dir, outfile): dlpath = os.path.join(ensembl_dir, dl) if os.path.isdir(dlpath): infname = os.path.join(dlpath, 'BioMart.tsv') + print(f'write_ensembl_ids for input filename {infname}') if os.path.exists(infname): # open each ensembl file, find the id column, and put it in the output with open(infname, 'r') as inf: From e135f54fe56c73ece49f9d940a6f9bfe88339b85 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Fri, 1 Dec 2023 22:22:08 -0500 Subject: [PATCH 17/21] Removed unnecessary import. --- src/createcompendia/anatomy.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/createcompendia/anatomy.py b/src/createcompendia/anatomy.py index 21dfed12..4edd2d0a 100644 --- a/src/createcompendia/anatomy.py +++ b/src/createcompendia/anatomy.py @@ -1,4 +1,3 @@ -import logging from collections import defaultdict import src.datahandlers.obo as obo From b0bdc968a1aad2d8220bb553789822d47aa3f9f2 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Tue, 5 Sep 2023 17:28:40 -0400 Subject: [PATCH 18/21] First stab at generating a GeneProtein conflation. --- src/snakefiles/geneprotein.snakefile | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/src/snakefiles/geneprotein.snakefile b/src/snakefiles/geneprotein.snakefile index 199cc6fc..dfb9afc0 100644 --- a/src/snakefiles/geneprotein.snakefile +++ b/src/snakefiles/geneprotein.snakefile @@ -21,9 +21,20 @@ rule geneprotein_conflation: run: geneprotein.build_conflation(input.geneprotein_concord,input.gene_compendium,input.protein_compendium,output.outfile) +rule geneprotein_conflated_synonyms: + input: + geneprotein_conflation=config['output_directory']+'/conflation/GeneProtein.txt', + gene_outputs=expand("{od}/synonyms/{ap}", od = config['output_directory'], ap = config['gene_outputs']), + protein_outputs=expand("{od}/synonyms/{ap}", od = config['output_directory'], ap = config['protein_outputs']), + output: + geneprotein_conflated=config['output_directory']+'/synonyms/GeneProteinConflated.txt' + run: + synonymconflation.conflate_synonyms(input.gene_outputs + input.protein_outputs, input.geneprotein_conflation, output=geneprotein_conflated) + rule geneprotein: input: - config['output_directory']+'/conflation/GeneProtein.txt' + config['output_directory']+'/conflation/GeneProtein.txt', + config['output_directory']+'/synonyms/GeneProteinConflated.txt' output: x=config['output_directory']+'/reports/geneprotein_done' shell: From 92c141cde76008b420863f547d4e3a59dd76c934 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Tue, 5 Sep 2023 17:30:32 -0400 Subject: [PATCH 19/21] Improved output variable name. --- src/snakefiles/geneprotein.snakefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/snakefiles/geneprotein.snakefile b/src/snakefiles/geneprotein.snakefile index dfb9afc0..07a54b7d 100644 --- a/src/snakefiles/geneprotein.snakefile +++ b/src/snakefiles/geneprotein.snakefile @@ -27,9 +27,9 @@ rule geneprotein_conflated_synonyms: gene_outputs=expand("{od}/synonyms/{ap}", od = config['output_directory'], ap = config['gene_outputs']), protein_outputs=expand("{od}/synonyms/{ap}", od = config['output_directory'], ap = config['protein_outputs']), output: - geneprotein_conflated=config['output_directory']+'/synonyms/GeneProteinConflated.txt' + geneprotein_conflated_synonyms=config['output_directory']+'/synonyms/GeneProteinConflated.txt' run: - synonymconflation.conflate_synonyms(input.gene_outputs + input.protein_outputs, input.geneprotein_conflation, output=geneprotein_conflated) + synonymconflation.conflate_synonyms(input.gene_outputs + input.protein_outputs, input.geneprotein_conflation, output=geneprotein_conflated_synonyms) rule geneprotein: input: From 617fd1299d74e8024b464f8a7c393ba8f88a7116 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Tue, 5 Sep 2023 17:44:32 -0400 Subject: [PATCH 20/21] Fixed typos. --- src/snakefiles/geneprotein.snakefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/snakefiles/geneprotein.snakefile b/src/snakefiles/geneprotein.snakefile index 07a54b7d..e436b8ef 100644 --- a/src/snakefiles/geneprotein.snakefile +++ b/src/snakefiles/geneprotein.snakefile @@ -29,7 +29,7 @@ rule geneprotein_conflated_synonyms: output: geneprotein_conflated_synonyms=config['output_directory']+'/synonyms/GeneProteinConflated.txt' run: - synonymconflation.conflate_synonyms(input.gene_outputs + input.protein_outputs, input.geneprotein_conflation, output=geneprotein_conflated_synonyms) + synonymconflation.conflate_synonyms(input.gene_outputs + input.protein_outputs, input.geneprotein_conflation, output.geneprotein_conflated_synonyms) rule geneprotein: input: From 2eb9ac7aaad7783519fddd830de7f0fb0d3fc648 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Tue, 5 Sep 2023 17:49:03 -0400 Subject: [PATCH 21/21] Fixed requirement for conflations to be a list. --- src/snakefiles/geneprotein.snakefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/snakefiles/geneprotein.snakefile b/src/snakefiles/geneprotein.snakefile index e436b8ef..a89ecc64 100644 --- a/src/snakefiles/geneprotein.snakefile +++ b/src/snakefiles/geneprotein.snakefile @@ -23,13 +23,13 @@ rule geneprotein_conflation: rule geneprotein_conflated_synonyms: input: - geneprotein_conflation=config['output_directory']+'/conflation/GeneProtein.txt', + geneprotein_conflations=[config['output_directory']+'/conflation/GeneProtein.txt'], gene_outputs=expand("{od}/synonyms/{ap}", od = config['output_directory'], ap = config['gene_outputs']), protein_outputs=expand("{od}/synonyms/{ap}", od = config['output_directory'], ap = config['protein_outputs']), output: geneprotein_conflated_synonyms=config['output_directory']+'/synonyms/GeneProteinConflated.txt' run: - synonymconflation.conflate_synonyms(input.gene_outputs + input.protein_outputs, input.geneprotein_conflation, output.geneprotein_conflated_synonyms) + synonymconflation.conflate_synonyms(input.gene_outputs + input.protein_outputs, input.geneprotein_conflations, output.geneprotein_conflated_synonyms) rule geneprotein: input: