From 5eaaaecf6794be79e26e3efd92cca66098ac201a Mon Sep 17 00:00:00 2001
From: Gaurav Vaidya <gaurav@renci.org>
Date: Fri, 13 Oct 2023 15:35:41 -0400
Subject: [PATCH 01/21] Add object normalization for PubChem IDs in
 drugchemical conflation.

---
 src/createcompendia/drugchemical.py | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/src/createcompendia/drugchemical.py b/src/createcompendia/drugchemical.py
index 1068a042..9976e71c 100644
--- a/src/createcompendia/drugchemical.py
+++ b/src/createcompendia/drugchemical.py
@@ -273,11 +273,21 @@ def build_conflation(rxn_concord,umls_concord,pubchem_rxn_concord,drug_compendiu
             x = line.strip().split('\t')
             subject = x[0]
             object = x[2]
-            #object is a PUBCHEM.  It's by definition a clique_leader.
+
             if subject in drug_rxcui_to_clique:
                 subject = drug_rxcui_to_clique[subject]
             elif subject in chemical_rxcui_to_clique:
                 subject = chemical_rxcui_to_clique[subject]
+            else:
+                raise RuntimeError(f"Unknown identifier in drugchemical conflation as subject: {subject}")
+
+            if object in drug_rxcui_to_clique:
+                object = drug_rxcui_to_clique[object]
+            elif object in chemical_rxcui_to_clique:
+                object = chemical_rxcui_to_clique[object]
+            else:
+                raise RuntimeError(f"Unknown identifier in drugchemical conflation as object: {object}")
+
             pairs.append((subject, object))
     print("glom")
     gloms = {}

From 4bd38177750704a6021666dd074fbcc1fa4a6e5f Mon Sep 17 00:00:00 2001
From: Gaurav Vaidya <gaurav@renci.org>
Date: Sat, 14 Oct 2023 15:48:59 -0400
Subject: [PATCH 02/21] Replaced exceptions with warnings.

---
 src/createcompendia/drugchemical.py | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/src/createcompendia/drugchemical.py b/src/createcompendia/drugchemical.py
index 9976e71c..7a9c0fa1 100644
--- a/src/createcompendia/drugchemical.py
+++ b/src/createcompendia/drugchemical.py
@@ -279,14 +279,22 @@ def build_conflation(rxn_concord,umls_concord,pubchem_rxn_concord,drug_compendiu
             elif subject in chemical_rxcui_to_clique:
                 subject = chemical_rxcui_to_clique[subject]
             else:
-                raise RuntimeError(f"Unknown identifier in drugchemical conflation as subject: {subject}")
+                logging.warning(
+                    f"Skipping subject-object pair ({subject}, {object}) because the subject isn't mapped to a RxCUI"
+                )
+                continue
+                # raise RuntimeError(f"Unknown identifier in drugchemical conflation as subject: {subject}")
 
             if object in drug_rxcui_to_clique:
                 object = drug_rxcui_to_clique[object]
             elif object in chemical_rxcui_to_clique:
                 object = chemical_rxcui_to_clique[object]
             else:
-                raise RuntimeError(f"Unknown identifier in drugchemical conflation as object: {object}")
+                logging.warning(
+                    f"Skipping subject-object pair ({subject}, {object}) because the object isn't mapped to a RxCUI"
+                )
+                continue
+                # raise RuntimeError(f"Unknown identifier in drugchemical conflation as object: {object}")
 
             pairs.append((subject, object))
     print("glom")

From 122c5166f903b06608b225d57e618e3c288cccf9 Mon Sep 17 00:00:00 2001
From: Gaurav Vaidya <gaurav@renci.org>
Date: Sat, 14 Oct 2023 16:04:03 -0400
Subject: [PATCH 03/21] Reverted one of the warnings back to an exception.

---
 src/createcompendia/drugchemical.py | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/src/createcompendia/drugchemical.py b/src/createcompendia/drugchemical.py
index 7a9c0fa1..0cf68175 100644
--- a/src/createcompendia/drugchemical.py
+++ b/src/createcompendia/drugchemical.py
@@ -279,11 +279,7 @@ def build_conflation(rxn_concord,umls_concord,pubchem_rxn_concord,drug_compendiu
             elif subject in chemical_rxcui_to_clique:
                 subject = chemical_rxcui_to_clique[subject]
             else:
-                logging.warning(
-                    f"Skipping subject-object pair ({subject}, {object}) because the subject isn't mapped to a RxCUI"
-                )
-                continue
-                # raise RuntimeError(f"Unknown identifier in drugchemical conflation as subject: {subject}")
+                raise RuntimeError(f"Unknown identifier in drugchemical conflation as subject: {subject}")
 
             if object in drug_rxcui_to_clique:
                 object = drug_rxcui_to_clique[object]

From f5e93c7030ac9b197d252498148200bddf3ba61c Mon Sep 17 00:00:00 2001
From: Gaurav Vaidya <gaurav@renci.org>
Date: Fri, 3 Nov 2023 02:07:27 -0400
Subject: [PATCH 04/21] Prevent and log KeyError in anatomy.

---
 src/createcompendia/anatomy.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/src/createcompendia/anatomy.py b/src/createcompendia/anatomy.py
index 4edd2d0a..02724cae 100644
--- a/src/createcompendia/anatomy.py
+++ b/src/createcompendia/anatomy.py
@@ -1,3 +1,4 @@
+import logging
 from collections import defaultdict
 
 import src.datahandlers.obo as obo
@@ -142,6 +143,9 @@ def create_typed_sets(eqsets,types):
         found  = False
         for prefix in [GO, CL, UBERON]:
             if prefix in prefixes and not found:
+                if prefixes[prefix][0] not in types:
+                    logging.warning(f"Could not find prefix {prefixes[prefix][0]} in {types}, skipping.")
+                    continue
                 mytype = types[prefixes[prefix][0]]
                 typed_sets[mytype].add(equivalent_ids)
                 found = True

From 73bb820baf09548badece65d0d25dd928d94dd11 Mon Sep 17 00:00:00 2001
From: Gaurav Vaidya <gaurav@renci.org>
Date: Fri, 3 Nov 2023 02:11:26 -0400
Subject: [PATCH 05/21] Reverted anatomy hiding code.

---
 src/createcompendia/anatomy.py | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/src/createcompendia/anatomy.py b/src/createcompendia/anatomy.py
index 02724cae..21dfed12 100644
--- a/src/createcompendia/anatomy.py
+++ b/src/createcompendia/anatomy.py
@@ -143,9 +143,6 @@ def create_typed_sets(eqsets,types):
         found  = False
         for prefix in [GO, CL, UBERON]:
             if prefix in prefixes and not found:
-                if prefixes[prefix][0] not in types:
-                    logging.warning(f"Could not find prefix {prefixes[prefix][0]} in {types}, skipping.")
-                    continue
                 mytype = types[prefixes[prefix][0]]
                 typed_sets[mytype].add(equivalent_ids)
                 found = True

From d295a942531f04bb0a69792ce28723e0488504eb Mon Sep 17 00:00:00 2001
From: Gaurav Vaidya <gaurav@renci.org>
Date: Fri, 3 Nov 2023 13:39:10 -0400
Subject: [PATCH 06/21] Moved UniProtKB downloads into Snakefile as wget
 commands.

---
 src/datahandlers/uniprotkb.py        |  7 -------
 src/snakefiles/datacollect.snakefile | 19 +++++++++++++------
 2 files changed, 13 insertions(+), 13 deletions(-)

diff --git a/src/datahandlers/uniprotkb.py b/src/datahandlers/uniprotkb.py
index 461356e1..d4c95eea 100644
--- a/src/datahandlers/uniprotkb.py
+++ b/src/datahandlers/uniprotkb.py
@@ -1,7 +1,5 @@
 from src.babel_utils import pull_via_urllib, make_local_name
 
-def pull_one_uniprotkb(which):
-    pull_via_urllib('ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/',f'uniprot_{which}.fasta.gz',subpath='UniProtKB')
 
 def readlabels(which):
     swissname = make_local_name(f'UniProtKB/uniprot_{which}.fasta')
@@ -17,11 +15,6 @@ def readlabels(which):
                 swissprot_labels[uniprotid] = f'{name} ({which})'
     return swissprot_labels
 
-def pull_uniprotkb():
-    pull_via_urllib('https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/idmapping/',f'idmapping.dat.gz',subpath='UniProtKB')
-    for which in ['sprot','trembl']:
-        pull_via_urllib('https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/',f'uniprot_{which}.fasta.gz',subpath='UniProtKB')
-
 def pull_uniprot_labels(sprotfile,tremblfile,fname):
     slabels = readlabels('sprot')
     tlabels = readlabels('trembl')
diff --git a/src/snakefiles/datacollect.snakefile b/src/snakefiles/datacollect.snakefile
index f9b32534..9dba1ad1 100644
--- a/src/snakefiles/datacollect.snakefile
+++ b/src/snakefiles/datacollect.snakefile
@@ -91,13 +91,20 @@ rule get_mods_labels:
 
 ### UniProtKB
 
-rule get_uniprotkb:
+rule get_uniprotkb_idmapping:
     output:
-        config['download_directory']+'/UniProtKB/uniprot_sprot.fasta',
-        config['download_directory']+'/UniProtKB/uniprot_trembl.fasta',
-        config['download_directory']+'/UniProtKB/idmapping.dat'
-    run:
-        uniprotkb.pull_uniprotkb()
+        idmapping = config['download_directory']+'/UniProtKB/idmapping.dat'
+    shell: """wget --continue --tries=10 "https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/idmapping/idmapping.dat.gz" -O {output.idmapping}.gz && gunzip {output.idmapping}.gz"""
+
+rule get_uniprotkb_sprot:
+    output:
+        uniprot_sprot = config['download_directory']+'/UniProtKB/uniprot_sprot.fasta'
+    shell: """wget --continue --tries=10 "https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot.fasta.gz" -O {output.uniprot_sprot}.gz && gunzip {output.uniprot_sprot}.gz"""
+
+rule get_uniprotkb_trembl:
+    output:
+        uniprot_trembl = config['download_directory']+'/UniProtKB/uniprot_trembl.fasta'
+    shell: """wget --continue --tries=10 "https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_trembl.fasta.gz" -O {output.uniprot_trembl}.gz && gunzip {output.uniprot_trembl}.gz"""
 
 rule get_uniprotkb_labels:
     input:

From c99f4ff42daf242b4d2ae2c389e6516cf332abb9 Mon Sep 17 00:00:00 2001
From: Gaurav Vaidya <gaurav@renci.org>
Date: Fri, 3 Nov 2023 15:09:37 -0400
Subject: [PATCH 07/21] Improved progress output so it can be mixed with other
 logs better.

---
 src/snakefiles/datacollect.snakefile | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/snakefiles/datacollect.snakefile b/src/snakefiles/datacollect.snakefile
index 9dba1ad1..f822f18c 100644
--- a/src/snakefiles/datacollect.snakefile
+++ b/src/snakefiles/datacollect.snakefile
@@ -94,17 +94,17 @@ rule get_mods_labels:
 rule get_uniprotkb_idmapping:
     output:
         idmapping = config['download_directory']+'/UniProtKB/idmapping.dat'
-    shell: """wget --continue --tries=10 "https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/idmapping/idmapping.dat.gz" -O {output.idmapping}.gz && gunzip {output.idmapping}.gz"""
+    shell: """wget --continue --progress=dot --tries=10 "https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/idmapping/idmapping.dat.gz" -O {output.idmapping}.gz && gunzip {output.idmapping}.gz"""
 
 rule get_uniprotkb_sprot:
     output:
         uniprot_sprot = config['download_directory']+'/UniProtKB/uniprot_sprot.fasta'
-    shell: """wget --continue --tries=10 "https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot.fasta.gz" -O {output.uniprot_sprot}.gz && gunzip {output.uniprot_sprot}.gz"""
+    shell: """wget --continue --progress=dot --tries=10 "https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot.fasta.gz" -O {output.uniprot_sprot}.gz && gunzip {output.uniprot_sprot}.gz"""
 
 rule get_uniprotkb_trembl:
     output:
         uniprot_trembl = config['download_directory']+'/UniProtKB/uniprot_trembl.fasta'
-    shell: """wget --continue --tries=10 "https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_trembl.fasta.gz" -O {output.uniprot_trembl}.gz && gunzip {output.uniprot_trembl}.gz"""
+    shell: """wget --continue --progress=dot --tries=10 "https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_trembl.fasta.gz" -O {output.uniprot_trembl}.gz && gunzip {output.uniprot_trembl}.gz"""
 
 rule get_uniprotkb_labels:
     input:

From e884477fab9b8c169c1d6eaba5f05d510c05bb0f Mon Sep 17 00:00:00 2001
From: Gaurav Vaidya <gaurav@renci.org>
Date: Sun, 5 Nov 2023 02:51:16 -0500
Subject: [PATCH 08/21] Deleted redundant output filename from
 pull_panther_pathways().

---
 src/snakefiles/datacollect.snakefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/snakefiles/datacollect.snakefile b/src/snakefiles/datacollect.snakefile
index f822f18c..36ad5afb 100644
--- a/src/snakefiles/datacollect.snakefile
+++ b/src/snakefiles/datacollect.snakefile
@@ -368,7 +368,7 @@ rule get_panther_pathways:
     output:
         outfile = config['download_directory'] + '/PANTHER.PATHWAY/SequenceAssociationPathway3.6.7.txt'
     run:
-        pantherpathways.pull_panther_pathways(output.outfile)
+        pantherpathways.pull_panther_pathways()
 
 rule get_panther_pathway_labels:
     input:

From 61b3afa0f927b7808ae803acb109f2c89be1476a Mon Sep 17 00:00:00 2001
From: Gaurav Vaidya <gaurav@renci.org>
Date: Sun, 5 Nov 2023 02:56:50 -0500
Subject: [PATCH 09/21] Took out --progress=dot as this somehow makes it even
 harder to read.

---
 src/snakefiles/datacollect.snakefile | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/snakefiles/datacollect.snakefile b/src/snakefiles/datacollect.snakefile
index 36ad5afb..1c34da59 100644
--- a/src/snakefiles/datacollect.snakefile
+++ b/src/snakefiles/datacollect.snakefile
@@ -94,17 +94,17 @@ rule get_mods_labels:
 rule get_uniprotkb_idmapping:
     output:
         idmapping = config['download_directory']+'/UniProtKB/idmapping.dat'
-    shell: """wget --continue --progress=dot --tries=10 "https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/idmapping/idmapping.dat.gz" -O {output.idmapping}.gz && gunzip {output.idmapping}.gz"""
+    shell: """wget --continue --tries=10 "https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/idmapping/idmapping.dat.gz" -O {output.idmapping}.gz && gunzip {output.idmapping}.gz"""
 
 rule get_uniprotkb_sprot:
     output:
         uniprot_sprot = config['download_directory']+'/UniProtKB/uniprot_sprot.fasta'
-    shell: """wget --continue --progress=dot --tries=10 "https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot.fasta.gz" -O {output.uniprot_sprot}.gz && gunzip {output.uniprot_sprot}.gz"""
+    shell: """wget --continue --tries=10 "https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot.fasta.gz" -O {output.uniprot_sprot}.gz && gunzip {output.uniprot_sprot}.gz"""
 
 rule get_uniprotkb_trembl:
     output:
         uniprot_trembl = config['download_directory']+'/UniProtKB/uniprot_trembl.fasta'
-    shell: """wget --continue --progress=dot --tries=10 "https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_trembl.fasta.gz" -O {output.uniprot_trembl}.gz && gunzip {output.uniprot_trembl}.gz"""
+    shell: """wget --continue --tries=10 "https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_trembl.fasta.gz" -O {output.uniprot_trembl}.gz && gunzip {output.uniprot_trembl}.gz"""
 
 rule get_uniprotkb_labels:
     input:

From d7d4d6aa34d74b5c09917733e0c903a98723847e Mon Sep 17 00:00:00 2001
From: Gaurav Vaidya <gaurav@renci.org>
Date: Sun, 5 Nov 2023 03:00:57 -0500
Subject: [PATCH 10/21] Added `-k` to gunzip so we keep the gzip file in case
 it's needed.

---
 src/snakefiles/datacollect.snakefile | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/snakefiles/datacollect.snakefile b/src/snakefiles/datacollect.snakefile
index 1c34da59..563d934a 100644
--- a/src/snakefiles/datacollect.snakefile
+++ b/src/snakefiles/datacollect.snakefile
@@ -94,17 +94,17 @@ rule get_mods_labels:
 rule get_uniprotkb_idmapping:
     output:
         idmapping = config['download_directory']+'/UniProtKB/idmapping.dat'
-    shell: """wget --continue --tries=10 "https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/idmapping/idmapping.dat.gz" -O {output.idmapping}.gz && gunzip {output.idmapping}.gz"""
+    shell: """wget --continue --tries=10 "https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/idmapping/idmapping.dat.gz" -O {output.idmapping}.gz && gunzip -k {output.idmapping}.gz"""
 
 rule get_uniprotkb_sprot:
     output:
         uniprot_sprot = config['download_directory']+'/UniProtKB/uniprot_sprot.fasta'
-    shell: """wget --continue --tries=10 "https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot.fasta.gz" -O {output.uniprot_sprot}.gz && gunzip {output.uniprot_sprot}.gz"""
+    shell: """wget --continue --tries=10 "https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot.fasta.gz" -O {output.uniprot_sprot}.gz && gunzip -k {output.uniprot_sprot}.gz"""
 
 rule get_uniprotkb_trembl:
     output:
         uniprot_trembl = config['download_directory']+'/UniProtKB/uniprot_trembl.fasta'
-    shell: """wget --continue --tries=10 "https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_trembl.fasta.gz" -O {output.uniprot_trembl}.gz && gunzip {output.uniprot_trembl}.gz"""
+    shell: """wget --continue --tries=10 "https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_trembl.fasta.gz" -O {output.uniprot_trembl}.gz && gunzip -k {output.uniprot_trembl}.gz"""
 
 rule get_uniprotkb_labels:
     input:

From c0315565d65ad7b43ab4a46860bdbbce11a472a7 Mon Sep 17 00:00:00 2001
From: Gaurav Vaidya <gaurav@renci.org>
Date: Sun, 19 Nov 2023 22:52:54 -0500
Subject: [PATCH 11/21] Removed babel-private.

---
 kubernetes/babel-private.k8s.yaml | 19 -------------------
 kubernetes/babel.k8s.yaml         |  5 -----
 2 files changed, 24 deletions(-)
 delete mode 100644 kubernetes/babel-private.k8s.yaml

diff --git a/kubernetes/babel-private.k8s.yaml b/kubernetes/babel-private.k8s.yaml
deleted file mode 100644
index 41926028..00000000
--- a/kubernetes/babel-private.k8s.yaml
+++ /dev/null
@@ -1,19 +0,0 @@
-# Kubernetes file for setting up a private volume to use for Babel.
-#
-# This private volume is only needed to store some "private" data, such
-# as UMLS files, that should not be included in the Docker image.
-# The private volume only needs to be ~5Gi in size.
-
-apiVersion: v1
-kind: PersistentVolumeClaim
-metadata:
-  name: babel-private
-  labels:
-    app: babel
-spec:
-  accessModes:
-    - ReadWriteOnce
-  resources:
-    requests:
-      storage: 10Gi
-  storageClassName: basic
diff --git a/kubernetes/babel.k8s.yaml b/kubernetes/babel.k8s.yaml
index efce9715..eb4050fd 100644
--- a/kubernetes/babel.k8s.yaml
+++ b/kubernetes/babel.k8s.yaml
@@ -24,8 +24,6 @@ spec:
     command: [ "/bin/bash", "-c", "--" ]
     args: [ "while true; echo Running; do sleep 30; done;" ]
     volumeMounts:
-    - mountPath: "/code/babel/input_data/private"
-      name: babel-private
     - mountPath: "/code/babel/babel_downloads"
       name: babel-downloads
     - mountPath: "/code/babel/babel_outputs"
@@ -38,9 +36,6 @@ spec:
         memory: "500G"
         cpu: "4"
   volumes:
-    - name: babel-private
-      persistentVolumeClaim:
-        claimName: babel-private
     - name: babel-downloads
       persistentVolumeClaim:
         claimName: babel-downloads

From 2d89a01c3533ec0d943524dd588b075d0d5249a7 Mon Sep 17 00:00:00 2001
From: Gaurav Vaidya <gaurav@renci.org>
Date: Sun, 19 Nov 2023 22:56:56 -0500
Subject: [PATCH 12/21] Upgraded UMLS and RxNorm versions.

---
 config.json | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/config.json b/config.json
index ad38c949..489232be 100644
--- a/config.json
+++ b/config.json
@@ -5,8 +5,8 @@
   "output_directory": "babel_outputs",
 
   "biolink_version": "3.5.4",
-  "umls_version": "2023AA",
-  "rxnorm_version": "08072023",
+  "umls_version": "2023AB",
+  "rxnorm_version": "11062023",
 
   "ncbi_files": ["gene2ensembl.gz", "gene_info.gz", "gene_orthologs.gz", "gene_refseq_uniprotkb_collab.gz", "mim2gene_medgen"],
   "ubergraph_ontologies": ["UBERON", "CL", "GO", "NCIT", "ECO", "ECTO", "ENVO", "HP", "UPHENO","BFO","BSPO","CARO","CHEBI","CP","GOREL","IAO","MAXO","MONDO","PATO","PR","RO","UBPROP"],

From a85fb42f0378ea45d0399a53793f6d6f9c95122a Mon Sep 17 00:00:00 2001
From: Gaurav Vaidya <gaurav@renci.org>
Date: Sun, 26 Nov 2023 18:14:12 -0500
Subject: [PATCH 13/21] Switched HGNC to HTTP from FTP.

The FTP server seems to be having issues right now.
---
 src/datahandlers/hgnc.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/datahandlers/hgnc.py b/src/datahandlers/hgnc.py
index 23fdd265..bc1a247a 100644
--- a/src/datahandlers/hgnc.py
+++ b/src/datahandlers/hgnc.py
@@ -1,9 +1,9 @@
-from src.babel_utils import make_local_name, pull_via_ftp
+from src.babel_utils import make_local_name, pull_via_ftp, pull_via_urllib
 import json
 
 def pull_hgnc():
     outfile='HGNC/hgnc_complete_set.json'
-    pull_via_ftp('ftp.ebi.ac.uk', '/pub/databases/genenames/new/json', 'hgnc_complete_set.json',outfilename=outfile)
+    pull_via_urllib('ftp.ebi.ac.uk', '/pub/databases/genenames/new/json', 'hgnc_complete_set.json',outfilename=outfile)
 
 def pull_hgnc_labels_and_synonyms(infile):
     with open(infile,'r') as data:

From da88e7a5ef285da5569dbf258b5c50b5e881f5cf Mon Sep 17 00:00:00 2001
From: Gaurav Vaidya <gaurav@renci.org>
Date: Sun, 26 Nov 2023 18:34:08 -0500
Subject: [PATCH 14/21] Fixed FTP -> HTTP change for HGNC.

---
 src/datahandlers/hgnc.py | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/src/datahandlers/hgnc.py b/src/datahandlers/hgnc.py
index bc1a247a..1776ee6d 100644
--- a/src/datahandlers/hgnc.py
+++ b/src/datahandlers/hgnc.py
@@ -1,9 +1,15 @@
-from src.babel_utils import make_local_name, pull_via_ftp, pull_via_urllib
+from src.babel_utils import make_local_name, pull_via_urllib
 import json
 
 def pull_hgnc():
-    outfile='HGNC/hgnc_complete_set.json'
-    pull_via_urllib('ftp.ebi.ac.uk', '/pub/databases/genenames/new/json', 'hgnc_complete_set.json',outfilename=outfile)
+    # On 2023-11-26, downloading this file over FTP from Python failed with an error (although,
+    # oddly, the same file could be downloaded without any problem using macOS Finder), so this
+    # function now fetches it over HTTPS instead.
+    pull_via_urllib(
+        'https://ftp.ebi.ac.uk/pub/databases/genenames/new/json/',
+        'hgnc_complete_set.json',
+        decompress=False,
+        subpath="HGNC")
 
 def pull_hgnc_labels_and_synonyms(infile):
     with open(infile,'r') as data:

From 0c677be610e179d8465e50c57abcb4dbd4455256 Mon Sep 17 00:00:00 2001
From: Gaurav Vaidya <gaurav@renci.org>
Date: Thu, 30 Nov 2023 00:30:18 -0500
Subject: [PATCH 15/21] Added code to skip hgfemale_gene_ensembl.

---
 src/datahandlers/ensembl.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/src/datahandlers/ensembl.py b/src/datahandlers/ensembl.py
index 04b67e78..804284c3 100644
--- a/src/datahandlers/ensembl.py
+++ b/src/datahandlers/ensembl.py
@@ -12,11 +12,17 @@
 # just what we need.
 def pull_ensembl(complete_file):
     f = find_datasets()
+
+    skip_dataset_ids = {'hgfemale_gene_ensembl'}
+
     cols = {"ensembl_gene_id", "ensembl_peptide_id", "description", "external_gene_name", "external_gene_source",
             "external_synonym", "chromosome_name", "source", "gene_biotype", "entrezgene_id", "zfin_id_id", 'mgi_id',
             'rgd_id', 'flybase_gene_id', 'sgd_gene', 'wormbase_gene'}
     for ds in f['Dataset_ID']:
         print(ds)
+        if ds in skip_dataset_ids:
+            print(f'Skipping {ds} as it is included in skip_dataset_ids: {skip_dataset_ids}')
+            continue
         outfile = make_local_name('BioMart.tsv', subpath=f'ENSEMBL/{ds}')
         # Really, we should let snakemake handle this, but then we would need to put a list of all the 200+ sets in our
         # config, and keep it up to date.  Maybe you could have a job that gets the datasets and writes a dataset file,

From 8fb47d4ef52d2b41506b2e8851dccb5f85df88fa Mon Sep 17 00:00:00 2001
From: Gaurav Vaidya <gaurav@renci.org>
Date: Thu, 30 Nov 2023 22:21:08 -0500
Subject: [PATCH 16/21] Improved debugging.

---
 src/createcompendia/protein.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/createcompendia/protein.py b/src/createcompendia/protein.py
index 05fc705d..06bc6fbd 100644
--- a/src/createcompendia/protein.py
+++ b/src/createcompendia/protein.py
@@ -65,6 +65,7 @@ def write_ensembl_ids(ensembl_dir, outfile):
             dlpath = os.path.join(ensembl_dir, dl)
             if os.path.isdir(dlpath):
                 infname = os.path.join(dlpath, 'BioMart.tsv')
+                print(f'write_ensembl_ids for input filename {infname}')
                 if os.path.exists(infname):
                     # open each ensembl file, find the id column, and put it in the output
                     with open(infname, 'r') as inf:

From e135f54fe56c73ece49f9d940a6f9bfe88339b85 Mon Sep 17 00:00:00 2001
From: Gaurav Vaidya <gaurav@renci.org>
Date: Fri, 1 Dec 2023 22:22:08 -0500
Subject: [PATCH 17/21] Removed unnecessary import.

---
 src/createcompendia/anatomy.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/createcompendia/anatomy.py b/src/createcompendia/anatomy.py
index 21dfed12..4edd2d0a 100644
--- a/src/createcompendia/anatomy.py
+++ b/src/createcompendia/anatomy.py
@@ -1,4 +1,3 @@
-import logging
 from collections import defaultdict
 
 import src.datahandlers.obo as obo

From b0bdc968a1aad2d8220bb553789822d47aa3f9f2 Mon Sep 17 00:00:00 2001
From: Gaurav Vaidya <gaurav@renci.org>
Date: Tue, 5 Sep 2023 17:28:40 -0400
Subject: [PATCH 18/21] First stab at generating a GeneProtein conflation.

---
 src/snakefiles/geneprotein.snakefile | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

diff --git a/src/snakefiles/geneprotein.snakefile b/src/snakefiles/geneprotein.snakefile
index 199cc6fc..dfb9afc0 100644
--- a/src/snakefiles/geneprotein.snakefile
+++ b/src/snakefiles/geneprotein.snakefile
@@ -21,9 +21,20 @@ rule geneprotein_conflation:
     run:
         geneprotein.build_conflation(input.geneprotein_concord,input.gene_compendium,input.protein_compendium,output.outfile)
 
+rule geneprotein_conflated_synonyms:
+    input:
+        geneprotein_conflation=config['output_directory']+'/conflation/GeneProtein.txt',
+        gene_outputs=expand("{od}/synonyms/{ap}", od = config['output_directory'], ap = config['gene_outputs']),
+        protein_outputs=expand("{od}/synonyms/{ap}", od = config['output_directory'], ap = config['protein_outputs']),
+    output:
+        geneprotein_conflated=config['output_directory']+'/synonyms/GeneProteinConflated.txt'
+    run:
+        synonymconflation.conflate_synonyms(input.gene_outputs + input.protein_outputs, input.geneprotein_conflation, output=geneprotein_conflated)
+
 rule geneprotein:
     input:
-        config['output_directory']+'/conflation/GeneProtein.txt'
+        config['output_directory']+'/conflation/GeneProtein.txt',
+        config['output_directory']+'/synonyms/GeneProteinConflated.txt'
     output:
         x=config['output_directory']+'/reports/geneprotein_done'
     shell:

From 92c141cde76008b420863f547d4e3a59dd76c934 Mon Sep 17 00:00:00 2001
From: Gaurav Vaidya <gaurav@renci.org>
Date: Tue, 5 Sep 2023 17:30:32 -0400
Subject: [PATCH 19/21] Improved output variable name.

---
 src/snakefiles/geneprotein.snakefile | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/snakefiles/geneprotein.snakefile b/src/snakefiles/geneprotein.snakefile
index dfb9afc0..07a54b7d 100644
--- a/src/snakefiles/geneprotein.snakefile
+++ b/src/snakefiles/geneprotein.snakefile
@@ -27,9 +27,9 @@ rule geneprotein_conflated_synonyms:
         gene_outputs=expand("{od}/synonyms/{ap}", od = config['output_directory'], ap = config['gene_outputs']),
         protein_outputs=expand("{od}/synonyms/{ap}", od = config['output_directory'], ap = config['protein_outputs']),
     output:
-        geneprotein_conflated=config['output_directory']+'/synonyms/GeneProteinConflated.txt'
+        geneprotein_conflated_synonyms=config['output_directory']+'/synonyms/GeneProteinConflated.txt'
     run:
-        synonymconflation.conflate_synonyms(input.gene_outputs + input.protein_outputs, input.geneprotein_conflation, output=geneprotein_conflated)
+        synonymconflation.conflate_synonyms(input.gene_outputs + input.protein_outputs, input.geneprotein_conflation, output=geneprotein_conflated_synonyms)
 
 rule geneprotein:
     input:

From 617fd1299d74e8024b464f8a7c393ba8f88a7116 Mon Sep 17 00:00:00 2001
From: Gaurav Vaidya <gaurav@renci.org>
Date: Tue, 5 Sep 2023 17:44:32 -0400
Subject: [PATCH 20/21] Fixed typos.

---
 src/snakefiles/geneprotein.snakefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/snakefiles/geneprotein.snakefile b/src/snakefiles/geneprotein.snakefile
index 07a54b7d..e436b8ef 100644
--- a/src/snakefiles/geneprotein.snakefile
+++ b/src/snakefiles/geneprotein.snakefile
@@ -29,7 +29,7 @@ rule geneprotein_conflated_synonyms:
     output:
         geneprotein_conflated_synonyms=config['output_directory']+'/synonyms/GeneProteinConflated.txt'
     run:
-        synonymconflation.conflate_synonyms(input.gene_outputs + input.protein_outputs, input.geneprotein_conflation, output=geneprotein_conflated_synonyms)
+        synonymconflation.conflate_synonyms(input.gene_outputs + input.protein_outputs, input.geneprotein_conflation, output.geneprotein_conflated_synonyms)
 
 rule geneprotein:
     input:

From 2eb9ac7aaad7783519fddd830de7f0fb0d3fc648 Mon Sep 17 00:00:00 2001
From: Gaurav Vaidya <gaurav@renci.org>
Date: Tue, 5 Sep 2023 17:49:03 -0400
Subject: [PATCH 21/21] Fixed requirement for conflations to be a list.

---
 src/snakefiles/geneprotein.snakefile | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/snakefiles/geneprotein.snakefile b/src/snakefiles/geneprotein.snakefile
index e436b8ef..a89ecc64 100644
--- a/src/snakefiles/geneprotein.snakefile
+++ b/src/snakefiles/geneprotein.snakefile
@@ -23,13 +23,13 @@ rule geneprotein_conflation:
 
 rule geneprotein_conflated_synonyms:
     input:
-        geneprotein_conflation=config['output_directory']+'/conflation/GeneProtein.txt',
+        geneprotein_conflations=[config['output_directory']+'/conflation/GeneProtein.txt'],
         gene_outputs=expand("{od}/synonyms/{ap}", od = config['output_directory'], ap = config['gene_outputs']),
         protein_outputs=expand("{od}/synonyms/{ap}", od = config['output_directory'], ap = config['protein_outputs']),
     output:
         geneprotein_conflated_synonyms=config['output_directory']+'/synonyms/GeneProteinConflated.txt'
     run:
-        synonymconflation.conflate_synonyms(input.gene_outputs + input.protein_outputs, input.geneprotein_conflation, output.geneprotein_conflated_synonyms)
+        synonymconflation.conflate_synonyms(input.gene_outputs + input.protein_outputs, input.geneprotein_conflations, output.geneprotein_conflated_synonyms)
 
 rule geneprotein:
     input: