From 303786e95811848ee2b8268eb365234e9ce2857c Mon Sep 17 00:00:00 2001
From: Jaclyn Beck <jaclyn.beck@sagebase.org>
Date: Fri, 16 Feb 2024 18:25:53 -0800
Subject: [PATCH 1/7] Modified genes_biodomains transform to handle
 semicolon-separated Ensembl IDs, and edited the tests to test that case

---
 config.yaml                                   |  4 +-
 .../etl/transform/genes_biodomains.py         | 48 +++++++++++++++++++
 test_config.yaml                              |  4 +-
 .../input/biodomains_test_input.csv           |  4 +-
 ...domains_test_input_bad_but_should_pass.csv |  2 +-
 .../output/genes_biodomains.json              | 27 ++++++-----
 tests/transform/test_genes_biodomains.py      | 20 ++++++++
 7 files changed, 90 insertions(+), 19 deletions(-)

diff --git a/config.yaml b/config.yaml
index b70165f9..abce93f6 100644
--- a/config.yaml
+++ b/config.yaml
@@ -4,10 +4,10 @@ sources:
   - genes_biodomains:
     genes_biodomains_files: &genes_biodomains_files
       - name: genes_biodomains
-        id: syn44151254.1
+        id: syn44151254.4
         format: csv
     genes_biodomains_provenance: &genes_biodomains_provenance
-      - syn44151254.1
+      - syn44151254.4
   - overall_scores:
     overall_scores_files: &overall_scores_files
       - name: overall_scores
diff --git a/src/agoradatatools/etl/transform/genes_biodomains.py b/src/agoradatatools/etl/transform/genes_biodomains.py
index 9d278cda..15c44784 100644
--- a/src/agoradatatools/etl/transform/genes_biodomains.py
+++ b/src/agoradatatools/etl/transform/genes_biodomains.py
@@ -37,6 +37,51 @@ def count_grouped_total(
     return df
 
 
+def split_ensembl_ids(genes_biodomains: pd.DataFrame) -> pd.DataFrame:
+    """The "ensembl_gene_id" column in the genes_biodomains data frame has some single Ensembl IDs and some rows with a
+    semicolon-separated list of Ensembl IDs. This function finds the rows with semicolons, adds rows to the
+    genes_biodomains dataframe such that there is one row per Ensembl ID in that list, and assigns a single Ensembl ID
+    to each row.
+
+    Args:
+        genes_biodomains (pd.DataFrame): DataFrame containing a column named "ensembl_gene_id"
+
+    Returns:
+        pd.DataFrame: a DataFrame with the same columns as the input but with additional rows added, plus the
+                      "ensembl_gene_id" column only has one Ensembl ID per row.
+    """
+
+    # Split the whole column on ";". Rows that don't need to be split will have a length of 1, while rows that do need
+    # to be split will have 2 or more in the list.
+    ens_lists = genes_biodomains["ensembl_gene_id"].str.split(pat=";")
+    needs_split = ens_lists.apply(len) > 1
+
+    # Edit the rows where needs_split is True, referencing by the DataFrame index
+    for df_ind in needs_split.index[needs_split]:
+        ensembl_ids = ens_lists[df_ind]
+
+        # Guard against extra semicolons or ending the string with a semicolon, which will both result in a blank
+        # character as an Ensembl ID
+        ensembl_ids = [x for x in ensembl_ids if x != ""]
+
+        # If there is still more than one Ensembl ID in the list after removing '', add as many new rows as there are
+        # (Ensembl IDs - 1), since there is already 1 row in the data frame for this group of IDs
+        if len(ensembl_ids) > 1:
+            row_dupe = genes_biodomains.loc[df_ind].copy().to_frame().T
+
+            genes_biodomains = pd.concat(
+                [genes_biodomains] + [row_dupe] * (len(ensembl_ids) - 1)
+            )
+
+            # The added rows plus the original row all have the same index, so this sets all rows with that index at once.
+            genes_biodomains.at[df_ind, "ensembl_gene_id"] = ensembl_ids
+
+        else:
+            genes_biodomains.at[df_ind, "ensembl_gene_id"] = ensembl_ids[0]
+
+    return genes_biodomains
+
+
 def transform_genes_biodomains(datasets: dict) -> pd.DataFrame:
     """Takes dictionary of dataset DataFrames, extracts the genes_biodomains
     DataFrame, calculates some metrics on GO terms per gene / biodomain, and
@@ -55,6 +100,9 @@ def transform_genes_biodomains(datasets: dict) -> pd.DataFrame:
     interesting_columns = ["ensembl_gene_id", "biodomain", "go_terms"]
     genes_biodomains = genes_biodomains[interesting_columns].dropna()
 
+    genes_biodomains = split_ensembl_ids(genes_biodomains)
+    genes_biodomains = genes_biodomains.reset_index(drop=True)
+
     # Count the number of go_terms associated with each biodomain
     n_biodomain_terms = count_grouped_total(
         genes_biodomains, "biodomain", "go_terms", "n_biodomain_terms"
diff --git a/test_config.yaml b/test_config.yaml
index 9e0d7872..879a6d1d 100644
--- a/test_config.yaml
+++ b/test_config.yaml
@@ -4,10 +4,10 @@ sources:
   - genes_biodomains:
     genes_biodomains_files: &genes_biodomains_files
       - name: genes_biodomains
-        id: syn44151254.1
+        id: syn44151254.4
         format: csv
     genes_biodomains_provenance: &genes_biodomains_provenance
-      - syn44151254.1
+      - syn44151254.4
   - overall_scores:
     overall_scores_files: &overall_scores_files
       - name: overall_scores
diff --git a/tests/test_assets/genes_biodomains/input/biodomains_test_input.csv b/tests/test_assets/genes_biodomains/input/biodomains_test_input.csv
index 61233b9d..221f49f1 100644
--- a/tests/test_assets/genes_biodomains/input/biodomains_test_input.csv
+++ b/tests/test_assets/genes_biodomains/input/biodomains_test_input.csv
@@ -2,7 +2,7 @@ biodomain,abbr,label,color,go_id,go_terms,ensembl_gene_id
 Autophagy,Au,Autophagy [Au],#9931fd,GO:0006914,autophagy,ENSG00000161011
 Autophagy,Au,Autophagy [Au],#9931fd,GO:0016236,macroautophagy,ENSG00000161011
 Autophagy,Au,Autophagy [Au],#9931fd,GO:0000422,autophagy of mitochondrion,ENSG00000161011
-Autophagy,Au,Autophagy [Au],#9931fd,GO:0000423,mitophagy,ENSG00000161011
+Autophagy,Au,Autophagy [Au],#9931fd,GO:0000423,mitophagy,ENSG00000161011;ENSG00000000938
 Autophagy,Au,Autophagy [Au],#9931fd,GO:0035973,aggrephagy,ENSG00000161011
 Autophagy,Au,Autophagy [Au],#9931fd,GO:0038096,Fc-gamma receptor signaling pathway involved in phagocytosis,ENSG00000000938
 Autophagy,Au,Autophagy [Au],#9931fd,GO:0050764,regulation of phagocytosis,ENSG00000000938
@@ -34,4 +34,4 @@ Proteostasis,Pr,Proteostasis [Pr],#c8b269,GO:0070037,rRNA (pseudouridine) methyl
 Proteostasis,Pr,Proteostasis [Pr],#c8b269,GO:0031625,ubiquitin protein ligase binding,ENSG00000161011
 Proteostasis,Pr,Proteostasis [Pr],#c8b269,GO:0043130,ubiquitin binding,ENSG00000161011
 Proteostasis,Pr,Proteostasis [Pr],#c8b269,GO:0016235,aggresome,ENSG00000161011
-Proteostasis,Pr,Proteostasis [Pr],#c8b269,GO:0016235,aggresome,ENSG00000000938
+Proteostasis,Pr,Proteostasis [Pr],#c8b269,GO:0016235,aggresome,ENSG00000000938;ENSG00000188157;ENSG00000290146
diff --git a/tests/test_assets/genes_biodomains/input/biodomains_test_input_bad_but_should_pass.csv b/tests/test_assets/genes_biodomains/input/biodomains_test_input_bad_but_should_pass.csv
index 659ff314..31360d47 100644
--- a/tests/test_assets/genes_biodomains/input/biodomains_test_input_bad_but_should_pass.csv
+++ b/tests/test_assets/genes_biodomains/input/biodomains_test_input_bad_but_should_pass.csv
@@ -1,7 +1,7 @@
 biodomain,abbr,label,color,go_id,go_terms,ensembl_gene_id
 ,Au,Autophagy [Au],#9931fd,GO:0006914,autophagy,ENSG00000161011
 Autophagy,,Autophagy [Au],#9931fd,GO:0016236,macroautophagy,ENSG00000161011
-Autophagy,Au,,#9931fd,GO:0000422,autophagy of mitochondrion,ENSG00000161011
+Autophagy,Au,,#9931fd,GO:0000422,autophagy of mitochondrion,ENSG00000161011;
 Autophagy,Au,Autophagy [Au],,GO:0000423,mitophagy,ENSG00000161011
 Autophagy,Au,Autophagy [Au],#9931fd,,aggrephagy,ENSG00000161011
 Autophagy,Au,Autophagy [Au],#9931fd,GO:0038096,,ENSG00000000938
diff --git a/tests/test_assets/genes_biodomains/output/genes_biodomains.json b/tests/test_assets/genes_biodomains/output/genes_biodomains.json
index 04eab60c..74a4bd63 100644
--- a/tests/test_assets/genes_biodomains/output/genes_biodomains.json
+++ b/tests/test_assets/genes_biodomains/output/genes_biodomains.json
@@ -6,11 +6,12 @@
         "biodomain": "Autophagy",
         "go_terms": [
           "Fc-gamma receptor signaling pathway involved in phagocytosis",
-          "regulation of phagocytosis"
+          "regulation of phagocytosis",
+          "mitophagy"
         ],
         "n_biodomain_terms": 10,
-        "n_gene_biodomain_terms": 2,
-        "pct_linking_terms": 28.57
+        "n_gene_biodomain_terms": 3,
+        "pct_linking_terms": 37.50
       },
       {
         "biodomain": "Mitochondrial Metabolism",
@@ -21,7 +22,7 @@
         ],
         "n_biodomain_terms": 7,
         "n_gene_biodomain_terms": 3,
-        "pct_linking_terms": 42.86
+        "pct_linking_terms": 37.50
       },
       {
         "biodomain": "Proteostasis",
@@ -30,7 +31,7 @@
         ],
         "n_biodomain_terms": 8,
         "n_gene_biodomain_terms": 1,
-        "pct_linking_terms": 14.29
+        "pct_linking_terms": 12.50
       },
       {
         "biodomain": "Synapse",
@@ -39,7 +40,7 @@
         ],
         "n_biodomain_terms": 9,
         "n_gene_biodomain_terms": 1,
-        "pct_linking_terms": 14.29
+        "pct_linking_terms": 12.50
       }
     ]
   },
@@ -105,11 +106,12 @@
       {
         "biodomain": "Proteostasis",
         "go_terms": [
-          "Golgi lumen"
+          "Golgi lumen",
+          "aggresome"
         ],
         "n_biodomain_terms": 8,
-        "n_gene_biodomain_terms": 1,
-        "pct_linking_terms": 14.29
+        "n_gene_biodomain_terms": 2,
+        "pct_linking_terms": 25.00
       },
       {
         "biodomain": "Synapse",
@@ -123,7 +125,7 @@
         ],
         "n_biodomain_terms": 9,
         "n_gene_biodomain_terms": 6,
-        "pct_linking_terms": 85.71
+        "pct_linking_terms": 75.00
       }
     ]
   },
@@ -135,10 +137,11 @@
         "go_terms": [
           "ribosome biogenesis",
           "rRNA base methylation",
-          "rRNA (pseudouridine) methyltransferase activity"
+          "rRNA (pseudouridine) methyltransferase activity",
+          "aggresome"
         ],
         "n_biodomain_terms": 8,
-        "n_gene_biodomain_terms": 3,
+        "n_gene_biodomain_terms": 4,
         "pct_linking_terms": 100.0
       }
     ]
diff --git a/tests/transform/test_genes_biodomains.py b/tests/transform/test_genes_biodomains.py
index 799853c8..fa5d976b 100644
--- a/tests/transform/test_genes_biodomains.py
+++ b/tests/transform/test_genes_biodomains.py
@@ -43,6 +43,26 @@ def test_count_grouped_total_two_groups(self):
         assert counted.equals(expected_df)
 
 
+def test_split_ensembl_ids():
+    input_df = pd.DataFrame(
+        {
+            "ensembl_gene_id": ["a", "a;d", "b;", "b;c;d;e;f"],  # 'Ensembl IDs'
+            "col_2": ["x", "y", "z", "x"],  # 3 'biodomains'
+            "col_3": ["1", "2", "3", "4"],  # 4 'go_terms'
+        }
+    )
+    expected_df = pd.DataFrame(
+        {
+            "ensembl_gene_id": ["a", "a", "b", "b", "d", "c", "d", "e", "f"],
+            "col_2": ["x", "y", "z", "x", "y", "x", "x", "x", "x"],
+            "col_3": ["1", "2", "3", "4", "2", "4", "4", "4", "4"],
+        }
+    )
+    output = genes_biodomains.split_ensembl_ids(genes_biodomains=input_df)
+    output = output.reset_index(drop=True)  # reset needed so indices match
+    assert output.equals(expected_df)
+
+
 class TestTransformGenesBiodomains:
     data_files_path = "tests/test_assets/genes_biodomains"
     pass_test_data = [

From ca08eff1ea10152d33dc610efd2ed55f65a0c867 Mon Sep 17 00:00:00 2001
From: Jaclyn Beck <jaclyn.beck@sagebase.org>
Date: Tue, 20 Feb 2024 16:48:29 -0800
Subject: [PATCH 2/7] Moved split_ensembl_ids function from biodomains
 transform to utils file, made it more generic, updated its test, plus
 sonarcloud delinting

---
 src/agoradatatools/etl/transform/gene_info.py |  42 +++++--
 .../etl/transform/genes_biodomains.py         |  85 +++----------
 src/agoradatatools/etl/utils.py               |  65 +++++++++-
 tests/test_utils.py                           | 112 ++++++++++++++----
 tests/transform/test_genes_biodomains.py      |  36 ++----
 5 files changed, 211 insertions(+), 129 deletions(-)

diff --git a/src/agoradatatools/etl/transform/gene_info.py b/src/agoradatatools/etl/transform/gene_info.py
index bc30dcd4..62a06d8d 100644
--- a/src/agoradatatools/etl/transform/gene_info.py
+++ b/src/agoradatatools/etl/transform/gene_info.py
@@ -1,12 +1,12 @@
 import numpy as np
 import pandas as pd
 
-from agoradatatools.etl.utils import nest_fields
+from agoradatatools.etl.utils import nest_fields, split_delimited_field_to_multiple_rows
 
 
 def transform_gene_info(
-    datasets: dict, adjusted_p_value_threshold, protein_level_threshold
-):
+    datasets: dict, adjusted_p_value_threshold: float, protein_level_threshold: float
+) -> pd.DataFrame:
     """
     This function will perform transformations and incrementally create a dataset called gene_info.
     Each dataset will be left_joined onto gene_info, starting with gene_metadata.
@@ -81,6 +81,7 @@ def transform_gene_info(
         drop_columns=["ensembl_gene_id"],
     )
 
+    biodomains = biodomains.dropna(subset=["biodomain", "ensembl_gene_id"])
     biodomains = (
         biodomains.groupby("ensembl_gene_id")["biodomain"]
         .apply(set)  # ensure unique biodomain names
@@ -89,6 +90,10 @@ def transform_gene_info(
         .rename(columns={"biodomain": "biodomains"})
     )
 
+    biodomains = split_delimited_field_to_multiple_rows(
+        df=biodomains, split_field="ensembl_gene_id", delim=";"
+    )
+
     # sort biodomains list alphabetically
     biodomains["biodomains"] = biodomains["biodomains"].apply(sorted)
 
@@ -97,12 +102,19 @@ def transform_gene_info(
     # tep_info file and not the symbol in gene_info, because there are some mismatches
     # between the two and the hgnc_symbol from tep_info is the correct one to use here.
     # resource_url should be NA if both is_adi and is_tep are false.
-    resource_url_prefix = "https://adknowledgeportal.synapse.org/Explore/Target%20Enabling%20Resources?QueryWrapper0=%7B%22sql%22%3A%22select%20*%20from%20syn26146692%20WHERE%20%60isPublic%60%20%3D%20true%22%2C%22limit%22%3A25%2C%22offset%22%3A0%2C%22selectedFacets%22%3A%5B%7B%22concreteType%22%3A%22org.sagebionetworks.repo.model.table.FacetColumnValuesRequest%22%2C%22columnName%22%3A%22target%22%2C%22facetValues%22%3A%5B%22"
+    resource_url_prefix = (
+        "https://adknowledgeportal.synapse.org/Explore/Target%20Enabling%20Resources?QueryWrapper0=%7B%22sql%22%3A%22"
+        + "select%20*%20from%20syn26146692%20WHERE%20%60isPublic%60%20%3D%20true%22%2C%22limit%22%3A25%2C%22offset%22"
+        + "%3A0%2C%22selectedFacets%22%3A%5B%7B%22concreteType%22%3A%22org.sagebionetworks.repo.model.table."
+        + "FacetColumnValuesRequest%22%2C%22columnName%22%3A%22target%22%2C%22facetValues%22%3A%5B%22"
+    )
     resource_url_suffix = "%22%5D%7D%5D%7D"
     tep_info["resource_url"] = tep_info.apply(
-        lambda row: resource_url_prefix + row["hgnc_symbol"] + resource_url_suffix
-        if row["is_adi"] or row["is_tep"]
-        else np.NaN,
+        lambda row: (
+            resource_url_prefix + row["hgnc_symbol"] + resource_url_suffix
+            if row["is_adi"] or row["is_tep"]
+            else np.NaN
+        ),
         axis=1,
     )
 
@@ -161,9 +173,11 @@ def transform_gene_info(
 
     # fillna doesn't work for creating an empty array, need this function instead
     gene_info["alias"] = gene_info.apply(
-        lambda row: row["alias"]
-        if isinstance(row["alias"], np.ndarray)
-        else np.ndarray(0, dtype=object),
+        lambda row: (
+            row["alias"]
+            if isinstance(row["alias"], np.ndarray)
+            else np.ndarray(0, dtype=object)
+        ),
         axis=1,
     )
 
@@ -179,9 +193,11 @@ def transform_gene_info(
 
     # create 'total_nominations' field
     gene_info["total_nominations"] = gene_info.apply(
-        lambda row: len(row["target_nominations"])
-        if isinstance(row["target_nominations"], list)
-        else np.NaN,
+        lambda row: (
+            len(row["target_nominations"])
+            if isinstance(row["target_nominations"], list)
+            else np.NaN
+        ),
         axis=1,
     )
 
diff --git a/src/agoradatatools/etl/transform/genes_biodomains.py b/src/agoradatatools/etl/transform/genes_biodomains.py
index 15c44784..8d778c7c 100644
--- a/src/agoradatatools/etl/transform/genes_biodomains.py
+++ b/src/agoradatatools/etl/transform/genes_biodomains.py
@@ -2,7 +2,7 @@
 
 import pandas as pd
 
-from agoradatatools.etl.utils import nest_fields
+from agoradatatools.etl.utils import nest_fields, split_delimited_field_to_multiple_rows
 
 
 def count_grouped_total(
@@ -11,22 +11,19 @@ def count_grouped_total(
     input_colname: str,
     output_colname: str,
 ) -> pd.DataFrame:
-    """For each unique item/combination in the column(s) specified by grouping,
-    counts the number of unique items in the column [input_colname] that
-    correspond to that grouping. The calculated counts are put in a new
-    column and named with [output_colname].
+    """For each unique item/combination in the column(s) specified by grouping, counts the number of unique items in the
+    column [input_colname] that correspond to that grouping. The calculated counts are put in a new column and named
+    with [output_colname].
+
     Args:
-        df (pd.DataFrame): contains columns listed in grouping and
-                           input_colname. May contain other columns as well, but
+        df (pd.DataFrame): contains columns listed in grouping and input_colname. May contain other columns as well, but
                            these will be dropped from the returned data frame.
-        grouping (str or list): a string with a single column name, or a list of
-                                strings for multiple column names
+        grouping (str or list): a string with a single column name, or a list of strings for multiple column names
         input_colname (str): the name of the column to count
         output_colname (str): the name of the new column with calculated counts
     Returns:
-        pd.DataFrame: a data frame containing the grouping column(s) and a
-                      new column for output_colname, which contains the count of
-                      unique items in input_colname for each grouping item.
+        pd.DataFrame: a data frame containing the grouping column(s) and a new column for output_colname, which contains
+                      the count of unique items in input_colname for each grouping item.
     """
     df = (
         df.groupby(grouping)[input_colname]
@@ -37,71 +34,25 @@ def count_grouped_total(
     return df
 
 
-def split_ensembl_ids(genes_biodomains: pd.DataFrame) -> pd.DataFrame:
-    """The "ensembl_gene_id" column in the genes_biodomains data frame has some single Ensembl IDs and some rows with a
-    semicolon-separated list of Ensembl IDs. This function finds the rows with semicolons, adds rows to the
-    genes_biodomains dataframe such that there is one row per Ensembl ID in that list, and assigns a single Ensembl ID
-    to each row.
-
-    Args:
-        genes_biodomains (pd.DataFrame): DataFrame containing a column named "ensembl_gene_id"
-
-    Returns:
-        pd.DataFrame: a DataFrame with the same columns as the input but with additional rows added, plus the
-                      "ensembl_gene_id" column only has one Ensembl ID per row.
-    """
-
-    # Split the whole column on ";". Rows that don't need to be split will have a length of 1, while rows that do need
-    # to be split will have 2 or more in the list.
-    ens_lists = genes_biodomains["ensembl_gene_id"].str.split(pat=";")
-    needs_split = ens_lists.apply(len) > 1
-
-    # Edit the rows where needs_split is True, referencing by the DataFrame index
-    for df_ind in needs_split.index[needs_split]:
-        ensembl_ids = ens_lists[df_ind]
-
-        # Guard against extra semicolons or ending the string with a semicolon, which will both result in a blank
-        # character as an Ensembl ID
-        ensembl_ids = [x for x in ensembl_ids if x != ""]
-
-        # If there is still more than one Ensembl ID in the list after removing '', add as many new rows as there are
-        # (Ensembl IDs - 1), since there is already 1 row in the data frame for this group of IDs
-        if len(ensembl_ids) > 1:
-            row_dupe = genes_biodomains.loc[df_ind].copy().to_frame().T
-
-            genes_biodomains = pd.concat(
-                [genes_biodomains] + [row_dupe] * (len(ensembl_ids) - 1)
-            )
-
-            # The added rows plus the original row all have the same index, so this sets all rows with that index at once.
-            genes_biodomains.at[df_ind, "ensembl_gene_id"] = ensembl_ids
-
-        else:
-            genes_biodomains.at[df_ind, "ensembl_gene_id"] = ensembl_ids[0]
-
-    return genes_biodomains
-
-
 def transform_genes_biodomains(datasets: dict) -> pd.DataFrame:
-    """Takes dictionary of dataset DataFrames, extracts the genes_biodomains
-    DataFrame, calculates some metrics on GO terms per gene / biodomain, and
-    performs nest_fields on the final DataFrame. This results in a 2 column
-    DataFrame grouped by "ensembl_gene_id" and includes a collapsed nested
-    dictionary field "gene_biodomains"
+    """Takes dictionary of dataset DataFrames, extracts the genes_biodomains DataFrame, calculates some metrics on GO
+    terms per gene / biodomain, and performs nest_fields on the final DataFrame. This results in a 2 column DataFrame
+    grouped by "ensembl_gene_id" and includes a collapsed nested dictionary field "gene_biodomains"
 
     Args:
         datasets (dict[str, pd.DataFrame]): dictionary of dataset names mapped to their DataFrame
 
     Returns:
-        pd.DataFrame: 2 column DataFrame grouped by "ensembl_gene_id" including
-                      a collapsed nested dictionary field "gene_biodomains"
+        pd.DataFrame: 2 column DataFrame grouped by "ensembl_gene_id" including a collapsed nested dictionary field
+                      "gene_biodomains"
     """
     genes_biodomains = datasets["genes_biodomains"]
     interesting_columns = ["ensembl_gene_id", "biodomain", "go_terms"]
     genes_biodomains = genes_biodomains[interesting_columns].dropna()
 
-    genes_biodomains = split_ensembl_ids(genes_biodomains)
-    genes_biodomains = genes_biodomains.reset_index(drop=True)
+    genes_biodomains = split_delimited_field_to_multiple_rows(
+        df=genes_biodomains, split_field="ensembl_gene_id", delim=";"
+    )
 
     # Count the number of go_terms associated with each biodomain
     n_biodomain_terms = count_grouped_total(
@@ -151,7 +102,7 @@ def transform_genes_biodomains(datasets: dict) -> pd.DataFrame:
         df=genes_biodomains,
         grouping="ensembl_gene_id",
         new_column="gene_biodomains",
-        drop_columns="ensembl_gene_id",
+        drop_columns=["ensembl_gene_id"],
     )
 
     return genes_biodomains
diff --git a/src/agoradatatools/etl/utils.py b/src/agoradatatools/etl/utils.py
index 073abe65..b228ea55 100644
--- a/src/agoradatatools/etl/utils.py
+++ b/src/agoradatatools/etl/utils.py
@@ -1,4 +1,4 @@
-from typing import Union
+from typing import Union, Pattern
 
 import numpy as np
 import pandas as pd
@@ -110,7 +110,6 @@ def rename_columns(df: pd.DataFrame, column_map: dict) -> pd.DataFrame:
         df.rename(columns=column_map, inplace=True)
     except TypeError:
         print("Column mapping must be a dictionary")
-        return df
 
     return df
 
@@ -180,7 +179,7 @@ def calculate_distribution(
 
     Args:
         df (pd.DataFrame): the DataFrame to calculate distribution for
-        grouping (str or list of str): the column(s) to group the data frame on (example: "tissue" or ["tissue", "model"])
+        grouping (str or list[str]): the column(s) to group the data frame on (example: "tissue" or ["tissue", "model"])
         distribution_column (str): the name of the column to calculate distribution on (example: "logfc")
 
     Returns:
@@ -215,3 +214,63 @@ def calculate_distribution(
     df.drop("IQR", axis=1, inplace=True)
 
     return df
+
+
+def split_delimited_field_to_multiple_rows(
+    df: pd.DataFrame, split_field: str, delim: Union[str, Pattern]
+) -> pd.DataFrame:
+    """This function takes a dataframe with a column that contains delimiter-separated strings in some or all rows
+    (instead of a single value), splits those strings on the delimiter, and expands the dataframe so that each item in
+    the resulting list has its own row. For each row containing a delimiter-separated string in the target column, this
+    function creates duplicate rows for each item in that list, with identical data in the other columns. Then the
+    target column for these duplicate rows (plus the original row) is assigned a single value from the list, resulting
+    in one row per item in the former list.
+    An example of where this function is needed: the genes_biodomains dataset has some semicolon-separated Ensembl IDs
+    in its ensembl_gene_id field, in addition to rows with a single Ensembl ID in the field. For rows with a list of
+    Ensembl IDs, the field is split on ";" and the function creates duplicate rows for each Ensembl ID in the list. Then
+    the ensembl_gene_id field for these duplicates is re-assigned so that there is one Ensembl ID per row.
+
+    Args:
+        df (pd.DataFrame): the DataFrame containing a column with delimiter-separated strings. The column can contain a
+                           combination of rows with single values and rows with delimited strings. Every row in the
+                           column should be a string, not a Python list.
+        split_field (str): the name of the column with the strings to split up
+        delim (str or Pattern): the delimiter to split the column values on. This may be a string with a single
+                                character (e.g. ","), a string of multiple characters (e.g. ", "), or a compiled regex
+                                (e.g. re.compile("[,;_-]"))
+
+    Returns:
+        pd.DataFrame: a DataFrame with the same columns as the input but with additional rows added, in which the
+                      split_field column has only one value per row. The index is reset via reset_index(drop=True).
+    """
+
+    # Split the whole column on the delimiter. Rows that don't need to be split will have a list of length 1, while
+    # rows that do need to be split will have 2 or more items in the list.
+    split_lists = df[split_field].str.split(pat=delim)
+    needs_split = split_lists.apply(len) > 1
+
+    # Edit the rows where needs_split is True, referencing by the DataFrame index
+    for df_ind in needs_split.index[needs_split]:
+        split_items = split_lists[df_ind]
+
+        # Guard against extra delimiters or ending the string with a delimiter, which both result in an empty
+        # string as a list item
+        split_items = [x for x in split_items if x != ""]
+
+        # If there is still more than one item in the list after removing '', add as many new rows as there are
+        # (items - 1), since there is already 1 row in the data frame for this group of items
+        if len(split_items) > 1:
+            row_dupe = df.loc[df_ind].copy().to_frame().T
+
+            df = pd.concat([df] + [row_dupe] * (len(split_items) - 1))
+
+            # The added rows plus the original row all have the same index, so this sets all rows with that index at
+            # once.
+            df.at[df_ind, split_field] = split_items
+
+        # Otherwise change the value in split_field to the only item left in the list, which will erase the extra
+        # delimiters
+        else:
+            df.at[df_ind, split_field] = split_items[0]
+
+    return df.reset_index(drop=True)
diff --git a/tests/test_utils.py b/tests/test_utils.py
index d205dce9..9f3dbe25 100644
--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@@ -8,62 +8,63 @@
 import pytest
 import synapseclient
 import yaml
+import re
 
 from agoradatatools.etl import utils
 
 
 class TestLoginToSynapse:
     @pytest.fixture(scope="function", autouse=True)
-    def setup_method(self, syn):
+    def setup_method(self, syn: synapseclient.Synapse) -> None:
         self.patch_synapseclient = patch.object(
             synapseclient, "Synapse", return_value=syn
         ).start()
         self.patch_syn_login = patch.object(syn, "login", return_value=syn).start()
 
-    def teardown_method(self):
+    def teardown_method(self) -> None:
         mock.patch.stopall()
 
-    def test_login_with_token(self):
+    def test_login_with_token(self) -> None:
         utils._login_to_synapse(token="my_auth_token")
         self.patch_synapseclient.assert_called_once()
         self.patch_syn_login.assert_called_once_with(authToken="my_auth_token")
 
-    def test_login_no_token(self):
+    def test_login_no_token(self) -> None:
         utils._login_to_synapse(token=None)
         self.patch_synapseclient.assert_called_once()
         self.patch_syn_login.assert_called_once_with()
 
 
-def test_get_config_with_invalid_file_path():
+def test_get_config_with_invalid_file_path() -> None:
     with pytest.raises(FileNotFoundError, match="File not found. *"):
         utils._get_config(config_path="this/is/a/bad/path")
 
 
-def test_get_config_with_parser_error():
+def test_get_config_with_parser_error() -> None:
     with pytest.raises(
         yaml.parser.ParserError, match="YAML file unable to be parsed. *"
     ):
         utils._get_config(config_path="./tests/test_assets/bad_config_parsing.yaml")
 
 
-def test_get_config_with_scanner_error():
+def test_get_config_with_scanner_error() -> None:
     with pytest.raises(
         yaml.scanner.ScannerError, match="YAML file unable to be scanned. *"
     ):
         utils._get_config(config_path="./tests/test_assets/bad_config_scanning.yaml")
 
 
-def test_get_config_with_no_config_path():
+def test_get_config_with_no_config_path() -> None:
     config = utils._get_config(config_path=None)
     assert config["destination"] == "syn12177492"
 
 
-def test_get_config_with_config_path():
+def test_get_config_with_config_path() -> None:
     config = utils._get_config(config_path="./test_config.yaml")
     assert config["destination"] == "syn17015333"
 
 
-def test_standardize_column_names():
+def test_standardize_column_names() -> None:
     df = pd.DataFrame(
         {
             "a#": ["test_value"],
@@ -117,12 +118,12 @@ class TestStandardizeValues:
         }
     )
 
-    def test_standardize_values_success(self):
+    def test_standardize_values_success(self) -> None:
         standard_df = utils.standardize_values(df=self.df.copy())
         for value in standard_df.iloc[0].tolist():
             assert np.isnan(value)
 
-    def test_standardize_values_TypeError(self):
+    def test_standardize_values_TypeError(self) -> None:
         with patch.object(pd.DataFrame, "replace") as patch_replace:
             patch_replace.side_effect = TypeError
             captured_output = StringIO()
@@ -144,13 +145,13 @@ class TestRenameColumns:
     good_column_map = {"a": "e", "b": "f", "c": "g", "d": "h"}
     bad_column_map = []
 
-    def test_rename_columns_success(self):
+    def test_rename_columns_success(self) -> None:
         renamed_df = utils.rename_columns(
             df=self.df.copy(), column_map=self.good_column_map
         )
         assert list(renamed_df.columns) == list(self.good_column_map.values())
 
-    def test_rename_columns_TypeError(self):
+    def test_rename_columns_TypeError(self) -> None:
         captured_output = StringIO()
         sys.stdout = captured_output
         bad_renamed_df = utils.rename_columns(
@@ -182,7 +183,7 @@ class TestNestFields:
         }
     )
 
-    def test_nest_fields_with_dropped_column(self):
+    def test_nest_fields_with_dropped_column(self) -> None:
         expected_column_e = [
             [
                 {"a": "group_1", "b": "1", "c": "1"},
@@ -203,7 +204,7 @@ def test_nest_fields_with_dropped_column(self):
         )
         assert list(nested_df["e"]) == expected_column_e
 
-    def test_nest_fields_with_dropped_column_list(self):
+    def test_nest_fields_with_dropped_column_list(self) -> None:
         expected_column_e = [
             [
                 {"a": "group_1", "c": "1"},
@@ -224,7 +225,7 @@ def test_nest_fields_with_dropped_column_list(self):
         )
         assert list(nested_df["e"]) == expected_column_e
 
-    def test_nest_fields_no_drop_column(self):
+    def test_nest_fields_no_drop_column(self) -> None:
         expected_column_e = [
             [
                 {"a": "group_1", "b": "1", "c": "1", "d": "1"},
@@ -243,7 +244,7 @@ def test_nest_fields_no_drop_column(self):
         nested_df = utils.nest_fields(df=self.df_multirow, grouping="a", new_column="e")
         assert list(nested_df["e"]) == expected_column_e
 
-    def test_nest_fields_multirow_ValueError(self):
+    def test_nest_fields_multirow_ValueError(self) -> None:
         with pytest.raises(ValueError, match="nested_field_is_list *"):
             utils.nest_fields(
                 df=self.df_multirow,
@@ -253,7 +254,7 @@ def test_nest_fields_multirow_ValueError(self):
                 nested_field_is_list=False,
             )
 
-    def test_nest_fields_singlerow_nested_list_false(self):
+    def test_nest_fields_singlerow_nested_list_false(self) -> None:
         expected_column_e = [
             {"a": "group_1", "b": "1", "c": "1"},
             {"a": "group_2", "b": "1", "c": "1"},
@@ -337,7 +338,7 @@ class TestCalculateDistribution:
     )
 
     # Stats on "col_3", grouped by "col_1" only
-    def test_calculate_distribution_one_group(self):
+    def test_calculate_distribution_one_group(self) -> None:
         expected_df = pd.DataFrame(
             {
                 "col_1": ["a", "b", "c"],
@@ -354,7 +355,7 @@ def test_calculate_distribution_one_group(self):
         assert output_df.equals(expected_df)
 
     # Stats on "col_3", grouped by "col_1" and "col_2"
-    def test_calculate_distribution_two_groups(self):
+    def test_calculate_distribution_two_groups(self) -> None:
         expected_df = pd.DataFrame(
             {
                 "col_1": ["a", "a", "b", "c", "c"],
@@ -370,3 +371,72 @@ def test_calculate_distribution_two_groups(self):
             df=self.df, grouping=["col_1", "col_2"], distribution_column="col_3"
         )
         assert output_df.equals(expected_df)
+
+
+class TestSplitDelimitedFieldToMultipleRows:
+    """Tests the split_delimited_field_to_multiple_rows function with the 'delim' argument being either a string or a
+    compiled regex. It also tests the case where this function is called but no values need to be split up.
+    """
+
+    expected_df = pd.DataFrame(
+        {
+            "col_1": ["a", "a", "b", "b", "d", "c", "d", "e f", "g"],
+            "col_2": ["x", "y", "z", "x", "y", "x", "x", "x", "x"],
+            "col_3": ["1", "2", "3", "4", "2", "4", "4", "4", "4"],
+        }
+    )
+
+    def test_split_delimited_field_to_multiple_rows_with_character_delim(self) -> None:
+        input_df = pd.DataFrame(
+            {
+                "col_1": ["a", "a;d", "b;", "b;c;d;e f;g"],  # 'Ensembl IDs'
+                "col_2": ["x", "y", "z", "x"],  # 3 'biodomains'
+                "col_3": ["1", "2", "3", "4"],  # 4 'go_terms'
+            }
+        )
+
+        output = utils.split_delimited_field_to_multiple_rows(
+            df=input_df, split_field="col_1", delim=";"
+        )
+
+        assert output.equals(self.expected_df)
+
+    def test_split_delimited_field_to_multiple_rows_with_string_delim(self) -> None:
+        # The ", " pattern should split on the ", " but not the space in "e f"
+        input_df = pd.DataFrame(
+            {
+                "col_1": ["a", "a, d", "b, ", "b, c, d, e f, g"],  # 'Ensembl IDs'
+                "col_2": ["x", "y", "z", "x"],  # 3 'biodomains'
+                "col_3": ["1", "2", "3", "4"],  # 4 'go_terms'
+            }
+        )
+
+        output = utils.split_delimited_field_to_multiple_rows(
+            df=input_df, split_field="col_1", delim=", "
+        )
+
+        assert output.equals(self.expected_df)
+
+    def test_split_delimited_field_to_multiple_rows_with_regex_delim(self) -> None:
+        input_df = pd.DataFrame(
+            {
+                "col_1": ["a", "a;d", "b-", "b_c_d;e f;g"],  # 'Ensembl IDs'
+                "col_2": ["x", "y", "z", "x"],  # 3 'biodomains'
+                "col_3": ["1", "2", "3", "4"],  # 4 'go_terms'
+            }
+        )
+
+        pattern = re.compile("[;_-]")
+        output = utils.split_delimited_field_to_multiple_rows(
+            df=input_df, split_field="col_1", delim=pattern
+        )
+
+        assert output.equals(self.expected_df)
+
+    def test_split_delimited_field_to_multiple_rows_with_no_split(self) -> None:
+        input_df = self.expected_df.copy()
+        output = utils.split_delimited_field_to_multiple_rows(
+            df=input_df, split_field="col_1", delim=";"
+        )
+
+        assert output.equals(self.expected_df)
diff --git a/tests/transform/test_genes_biodomains.py b/tests/transform/test_genes_biodomains.py
index fa5d976b..c05af11c 100644
--- a/tests/transform/test_genes_biodomains.py
+++ b/tests/transform/test_genes_biodomains.py
@@ -1,3 +1,4 @@
+""" Integration tests for the genes_biodomains transform """
 import os
 
 import pandas as pd
@@ -7,6 +8,8 @@
 
 
 class TestCountGroupedTotal:
+    """Tests the count_grouped_total method two ways: grouping by one column only, and grouping by two columns."""
+
     df = pd.DataFrame(
         {
             "col_1": ["a", "a", "a", "b", "c", "c", "c"],  # 3 'Ensembl IDs'
@@ -17,7 +20,7 @@ class TestCountGroupedTotal:
     )
 
     # How many unique "col_2"'s per unique "col_1" value?
-    def test_count_grouped_total_one_group(self):
+    def test_count_grouped_total_one_group(self) -> None:
         expected_df = pd.DataFrame({"col_1": ["a", "b", "c"], "output": [3, 1, 2]})
         counted = genes_biodomains.count_grouped_total(
             df=self.df, grouping="col_1", input_colname="col_2", output_colname="output"
@@ -25,7 +28,7 @@ def test_count_grouped_total_one_group(self):
         assert counted.equals(expected_df)
 
     # How many unique "col_3"'s per unique combination of "col_1" + "col_2"?
-    def test_count_grouped_total_two_groups(self):
+    def test_count_grouped_total_two_groups(self) -> None:
         expected_df = pd.DataFrame(
             {
                 "col_1": ["a", "a", "a", "b", "c", "c"],
@@ -43,27 +46,10 @@ def test_count_grouped_total_two_groups(self):
         assert counted.equals(expected_df)
 
 
-def test_split_ensembl_ids():
-    input_df = pd.DataFrame(
-        {
-            "ensembl_gene_id": ["a", "a;d", "b;", "b;c;d;e;f"],  # 'Ensembl IDs'
-            "col_2": ["x", "y", "z", "x"],  # 3 'biodomains'
-            "col_3": ["1", "2", "3", "4"],  # 4 'go_terms'
-        }
-    )
-    expected_df = pd.DataFrame(
-        {
-            "ensembl_gene_id": ["a", "a", "b", "b", "d", "c", "d", "e", "f"],
-            "col_2": ["x", "y", "z", "x", "y", "x", "x", "x", "x"],
-            "col_3": ["1", "2", "3", "4", "2", "4", "4", "4", "4"],
-        }
-    )
-    output = genes_biodomains.split_ensembl_ids(genes_biodomains=input_df)
-    output = output.reset_index(drop=True)  # reset needed so indices match
-    assert output.equals(expected_df)
-
-
 class TestTransformGenesBiodomains:
+    """Tests the genes_biodomains custom transform with 'perfect' input, input with missing data, and data that results
+    in all rows being dropped from the data frame (which causes a failure)."""
+
     data_files_path = "tests/test_assets/genes_biodomains"
     pass_test_data = [
         (  # pass with good data
@@ -90,8 +76,8 @@ class TestTransformGenesBiodomains:
         "input_file, expected_output_file", pass_test_data, ids=pass_test_ids
     )
     def test_transform_genes_biodomains_should_pass(
-        self, input_file, expected_output_file
-    ):
+        self, input_file: str, expected_output_file: str
+    ) -> None:
         input_df = pd.read_csv(os.path.join(self.data_files_path, "input", input_file))
         output_df = genes_biodomains.transform_genes_biodomains(
             datasets={"genes_biodomains": input_df}
@@ -102,7 +88,7 @@ def test_transform_genes_biodomains_should_pass(
         pd.testing.assert_frame_equal(output_df, expected_df)
 
     @pytest.mark.parametrize("input_file", fail_test_data, ids=fail_test_ids)
-    def test_transform_genes_biodomains_should_fail(self, input_file):
+    def test_transform_genes_biodomains_should_fail(self, input_file: str) -> None:
         with pytest.raises(
             ValueError, match="cannot insert ensembl_gene_id, already exists"
         ):

From 04014dd1f6676729eebaf6e38a5fff1a30669a1c Mon Sep 17 00:00:00 2001
From: Jaclyn Beck <jaclyn.beck@sagebase.org>
Date: Tue, 20 Feb 2024 19:00:27 -0800
Subject: [PATCH 3/7] Updated gene annotation pre-processing to handle new
 biodomains file, bumped version of gene_table_merged to the new file

---
 config.yaml                                   |    4 +-
 .../AG-896_Preprocess_Gene_Annotations.ipynb  | 1132 ++++++++++-------
 test_config.yaml                              |    4 +-
 3 files changed, 656 insertions(+), 484 deletions(-)

diff --git a/config.yaml b/config.yaml
index abce93f6..6a1393ef 100644
--- a/config.yaml
+++ b/config.yaml
@@ -135,7 +135,7 @@ datasets:
   - gene_info:
       files:
         - name: gene_metadata
-          id: syn25953363.10
+          id: syn25953363.11
           format: feather
         - name: igap
           id: syn12514826.5
@@ -178,7 +178,7 @@ datasets:
         possible_replacement: ensembl_possible_replacements
         permalink: ensembl_permalink
       provenance:
-        - syn25953363.10
+        - syn25953363.11
         - syn12514826.5
         - syn12514912.3
         - *agora_proteomics_provenance
diff --git a/data_analysis/agora/notebooks/preprocessing/AG-896_Preprocess_Gene_Annotations.ipynb b/data_analysis/agora/notebooks/preprocessing/AG-896_Preprocess_Gene_Annotations.ipynb
index 22446910..23240589 100644
--- a/data_analysis/agora/notebooks/preprocessing/AG-896_Preprocess_Gene_Annotations.ipynb
+++ b/data_analysis/agora/notebooks/preprocessing/AG-896_Preprocess_Gene_Annotations.ipynb
@@ -103,14 +103,6 @@
     "scrolled": true
    },
    "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "WARNING:rpy2.rinterface_lib.callbacks:R[write to console]: Ensembl site unresponsive, trying useast mirror\n",
-      "\n"
-     ]
-    },
     {
      "name": "stdout",
      "output_type": "stream",
@@ -217,7 +209,7 @@
     {
      "data": {
       "text/plain": [
-       "{'genes_biodomains': ('syn44151254.1', 'csv'),\n",
+       "{'genes_biodomains': ('syn44151254.4', 'csv'),\n",
        " 'neuropath_regression_results': ('syn22017882.5', 'csv'),\n",
        " 'proteomics': ('syn18689335.3', 'csv'),\n",
        " 'proteomics_tmt': ('syn35221005.2', 'csv'),\n",
@@ -281,22 +273,6 @@
     "scrolled": true
    },
    "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "\n",
-      "UPGRADE AVAILABLE\n",
-      "\n",
-      "A more recent version of the Synapse Client (4.0.0) is available. Your version (3.1.1) can be upgraded by typing:\n",
-      "    pip install --upgrade synapseclient\n",
-      "\n",
-      "Python Synapse Client version 4.0.0 release notes\n",
-      "\n",
-      "https://python-docs.synapse.org/news/\n",
-      "\n"
-     ]
-    },
     {
      "name": "stdout",
      "output_type": "stream",
@@ -317,7 +293,6 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "genes_biodomains has an NaN Ensembl ID\n",
       "target_exp_validation_harmonized has an n/A Ensembl ID\n",
       "WARNING: no Ensembl ID column found for team_info!\n",
       "WARNING: no Ensembl ID column found for team_member_info!\n"
@@ -325,7 +300,8 @@
     }
    ],
    "source": [
-    "syn = utils._login_to_synapse(token=None)  # Assumes you have already logged in with a valid token\n",
+    "# Assumes you have already logged in with a valid token\n",
+    "syn = utils._login_to_synapse(token=None)\n",
     "\n",
     "# The various column names used to store Ensembl IDs in the files\n",
     "col_names = [\"ENSG\", \"ensembl_gene_id\", \"GeneID\", \"ensembl_id\"]\n",
@@ -346,6 +322,14 @@
     "            df[[\"geneA_ensembl_gene_id\", \"geneB_ensembl_gene_id\"]]\n",
     "        )[\"value\"]\n",
     "\n",
+    "    # genes_biodomains is a special case -- the ensembl_id field has some semicolon-separated lists in it\n",
+    "    if file == \"genes_biodomains\":\n",
+    "        df = df[[\"Biodomain\", \"ensembl_id\"]].drop_duplicates().dropna()\n",
+    "        df = utils.split_delimited_field_to_multiple_rows(\n",
+    "            df=df, split_field=\"ensembl_id\", delim=\";\"\n",
+    "        )\n",
+    "        file_ensembl_ids = df[\"ensembl_id\"].drop_duplicates()\n",
+    "\n",
     "    if file_ensembl_ids is not None:\n",
     "        file_ensembl_list = file_ensembl_list + file_ensembl_ids.tolist()\n",
     "        if \"n/A\" in file_ensembl_ids.tolist():\n",
@@ -366,8 +350,8 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "3627 genes from the data files are missing from Biomart results and will be added.\n",
-      "66814\n"
+      "1821 genes from the data files are missing from Biomart results and will be added.\n",
+      "65009\n"
      ]
     }
    ],
@@ -549,9 +533,7 @@
       "INFO:biothings.client:done.\n",
       "INFO:biothings.client:querying 64001-65000...\n",
       "INFO:biothings.client:done.\n",
-      "INFO:biothings.client:querying 65001-66000...\n",
-      "INFO:biothings.client:done.\n",
-      "INFO:biothings.client:querying 66001-66814...\n",
+      "INFO:biothings.client:querying 65001-65009...\n",
       "INFO:biothings.client:done.\n"
      ]
     },
@@ -705,8 +687,8 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "Annotations found for 66166 genes.\n",
-      "No annotations found for 1230 genes.\n"
+      "Annotations found for 63844 genes.\n",
+      "No annotations found for 1176 genes.\n"
      ]
     }
    ],
@@ -743,7 +725,7 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "(67398, 11)\n"
+      "(65022, 11)\n"
      ]
     },
     {
@@ -927,7 +909,6 @@
     "    lambda cell: cell if isinstance(cell, list) else [cell]\n",
     ")\n",
     "\n",
-    "\n",
     "# Some alias values are lists of lists or have duplicate values\n",
     "def flatten(row):\n",
     "    flattened = []\n",
@@ -1068,92 +1049,245 @@
        "      <td>NaN</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>...</th>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
+       "      <th>5133</th>\n",
+       "      <td>ENSG00000230417</td>\n",
+       "      <td>10</td>\n",
+       "      <td>LINC00595</td>\n",
+       "      <td>414243</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>long intergenic non-protein coding RNA 595</td>\n",
+       "      <td>LINC00595</td>\n",
+       "      <td>ncRNA</td>\n",
+       "      <td>[C10orf101]</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>67263</th>\n",
-       "      <td>ENSG00000284262</td>\n",
+       "      <th>8675</th>\n",
+       "      <td>ENSG00000188660</td>\n",
+       "      <td>21</td>\n",
+       "      <td>LINC00319</td>\n",
+       "      <td>124900467</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>uncharacterized LOC124900467</td>\n",
+       "      <td>LOC124900467</td>\n",
+       "      <td>protein-coding</td>\n",
+       "      <td>[]</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>8676</th>\n",
+       "      <td>ENSG00000188660</td>\n",
+       "      <td>21</td>\n",
+       "      <td>LINC00319</td>\n",
+       "      <td>102724398</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>uncharacterized CH507-42P11.6</td>\n",
+       "      <td>CH507-42P11.6</td>\n",
+       "      <td>ncRNA</td>\n",
+       "      <td>[]</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>12016</th>\n",
+       "      <td>ENSG00000278903</td>\n",
+       "      <td>21</td>\n",
        "      <td></td>\n",
+       "      <td>124905527</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>uncharacterized LOC124905527</td>\n",
+       "      <td>LOC124905527</td>\n",
+       "      <td>ncRNA</td>\n",
+       "      <td>[]</td>\n",
        "      <td>NaN</td>\n",
-       "      <td>124906462</td>\n",
-       "      <td>2.0</td>\n",
-       "      <td>double homeobox protein 4</td>\n",
-       "      <td>LOC124906462</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>12017</th>\n",
+       "      <td>ENSG00000278903</td>\n",
+       "      <td>21</td>\n",
+       "      <td></td>\n",
+       "      <td>124905312</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>uncharacterized LOC124905312</td>\n",
+       "      <td>LOC124905312</td>\n",
+       "      <td>ncRNA</td>\n",
+       "      <td>[]</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>12018</th>\n",
+       "      <td>ENSG00000278903</td>\n",
+       "      <td>21</td>\n",
+       "      <td></td>\n",
+       "      <td>124905468</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>uncharacterized LOC124905468</td>\n",
+       "      <td>LOC124905468</td>\n",
+       "      <td>ncRNA</td>\n",
+       "      <td>[]</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>43337</th>\n",
+       "      <td>ENSG00000249738</td>\n",
+       "      <td>5</td>\n",
+       "      <td></td>\n",
+       "      <td>105377683</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>uncharacterized LOC105377683</td>\n",
+       "      <td>LOC105377683</td>\n",
+       "      <td>ncRNA</td>\n",
+       "      <td>[]</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>43338</th>\n",
+       "      <td>ENSG00000249738</td>\n",
+       "      <td>5</td>\n",
+       "      <td></td>\n",
+       "      <td>285626</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>uncharacterized LOC285626</td>\n",
+       "      <td>LOC285626</td>\n",
+       "      <td>ncRNA</td>\n",
+       "      <td>[]</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>43519</th>\n",
+       "      <td>ENSG00000230373</td>\n",
+       "      <td>15</td>\n",
+       "      <td>GOLGA6L5P</td>\n",
+       "      <td>100133220</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>golgin A6 family like 3, pseudogene</td>\n",
+       "      <td>GOLGA6L3P</td>\n",
+       "      <td>pseudo</td>\n",
+       "      <td>[GOLGA6L3]</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>43520</th>\n",
+       "      <td>ENSG00000230373</td>\n",
+       "      <td>15</td>\n",
+       "      <td>GOLGA6L5P</td>\n",
+       "      <td>642402</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>golgin A6 family like 17, pseudogene</td>\n",
+       "      <td>GOLGA6L17P</td>\n",
        "      <td>pseudo</td>\n",
+       "      <td>[GOLGA6L21P]</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>63071</th>\n",
+       "      <td>ENSG00000293331</td>\n",
+       "      <td>1</td>\n",
+       "      <td></td>\n",
+       "      <td>101928626</td>\n",
+       "      <td>2.0</td>\n",
+       "      <td>uncharacterized LOC101928626</td>\n",
+       "      <td>LOC101928626</td>\n",
+       "      <td>ncRNA</td>\n",
        "      <td>[]</td>\n",
        "      <td>NaN</td>\n",
        "      <td>NaN</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>67264</th>\n",
-       "      <td>ENSG00000284262</td>\n",
+       "      <th>63072</th>\n",
+       "      <td>ENSG00000293331</td>\n",
+       "      <td>1</td>\n",
        "      <td></td>\n",
+       "      <td>124901156</td>\n",
+       "      <td>2.0</td>\n",
+       "      <td>uncharacterized LOC124901156</td>\n",
+       "      <td>LOC124901156</td>\n",
+       "      <td>ncRNA</td>\n",
+       "      <td>[]</td>\n",
        "      <td>NaN</td>\n",
-       "      <td>124906463</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>63715</th>\n",
+       "      <td>ENSG00000276518</td>\n",
+       "      <td></td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>128966722</td>\n",
        "      <td>2.0</td>\n",
-       "      <td>double homeobox protein 4</td>\n",
-       "      <td>LOC124906463</td>\n",
-       "      <td>pseudo</td>\n",
+       "      <td>putative killer cell immunoglobulin-like recep...</td>\n",
+       "      <td>LOC128966722</td>\n",
+       "      <td>protein-coding</td>\n",
        "      <td>[]</td>\n",
        "      <td>NaN</td>\n",
        "      <td>NaN</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>67265</th>\n",
-       "      <td>ENSG00000284262</td>\n",
+       "      <th>63716</th>\n",
+       "      <td>ENSG00000276518</td>\n",
        "      <td></td>\n",
        "      <td>NaN</td>\n",
-       "      <td>124906464</td>\n",
+       "      <td>128966730</td>\n",
        "      <td>2.0</td>\n",
-       "      <td>double homeobox protein 4</td>\n",
-       "      <td>LOC124906464</td>\n",
-       "      <td>pseudo</td>\n",
+       "      <td>putative killer cell immunoglobulin-like recep...</td>\n",
+       "      <td>LOC128966730</td>\n",
+       "      <td>protein-coding</td>\n",
        "      <td>[]</td>\n",
        "      <td>NaN</td>\n",
        "      <td>NaN</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>67266</th>\n",
-       "      <td>ENSG00000284262</td>\n",
+       "      <th>63717</th>\n",
+       "      <td>ENSG00000276518</td>\n",
        "      <td></td>\n",
        "      <td>NaN</td>\n",
-       "      <td>124906465</td>\n",
+       "      <td>128966732</td>\n",
        "      <td>2.0</td>\n",
-       "      <td>double homeobox protein 4</td>\n",
-       "      <td>LOC124906465</td>\n",
-       "      <td>pseudo</td>\n",
+       "      <td>putative killer cell immunoglobulin-like recep...</td>\n",
+       "      <td>LOC128966732</td>\n",
+       "      <td>protein-coding</td>\n",
        "      <td>[]</td>\n",
        "      <td>NaN</td>\n",
        "      <td>NaN</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>67267</th>\n",
-       "      <td>ENSG00000284262</td>\n",
+       "      <th>63718</th>\n",
+       "      <td>ENSG00000276518</td>\n",
        "      <td></td>\n",
        "      <td>NaN</td>\n",
-       "      <td>124906466</td>\n",
+       "      <td>128966731</td>\n",
        "      <td>2.0</td>\n",
-       "      <td>double homeobox protein 4</td>\n",
-       "      <td>LOC124906466</td>\n",
-       "      <td>pseudo</td>\n",
+       "      <td>putative killer cell immunoglobulin-like recep...</td>\n",
+       "      <td>LOC128966731</td>\n",
+       "      <td>protein-coding</td>\n",
+       "      <td>[]</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>63719</th>\n",
+       "      <td>ENSG00000276518</td>\n",
+       "      <td></td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>128966733</td>\n",
+       "      <td>2.0</td>\n",
+       "      <td>putative killer cell immunoglobulin-like recep...</td>\n",
+       "      <td>LOC128966733</td>\n",
+       "      <td>protein-coding</td>\n",
        "      <td>[]</td>\n",
        "      <td>NaN</td>\n",
        "      <td>NaN</td>\n",
        "    </tr>\n",
        "  </tbody>\n",
        "</table>\n",
-       "<p>614 rows × 11 columns</p>\n",
        "</div>"
       ],
       "text/plain": [
@@ -1163,40 +1297,71 @@
        "5130   ENSG00000230417              10   LINC00856     414243       1.0   \n",
        "5131   ENSG00000230417              10   LINC00856     414243       1.0   \n",
        "5132   ENSG00000230417              10   LINC00595     414243       1.0   \n",
-       "...                ...             ...         ...        ...       ...   \n",
-       "67263  ENSG00000284262                         NaN  124906462       2.0   \n",
-       "67264  ENSG00000284262                         NaN  124906463       2.0   \n",
-       "67265  ENSG00000284262                         NaN  124906464       2.0   \n",
-       "67266  ENSG00000284262                         NaN  124906465       2.0   \n",
-       "67267  ENSG00000284262                         NaN  124906466       2.0   \n",
-       "\n",
-       "                                             name        symbol type_of_gene  \\\n",
-       "4089                 uncharacterized LOC101927042  LOC101927042        ncRNA   \n",
-       "4090                 uncharacterized LOC124902157  LOC124902157        ncRNA   \n",
-       "5130   long intergenic non-protein coding RNA 595     LINC00595        ncRNA   \n",
-       "5131   long intergenic non-protein coding RNA 595     LINC00595        ncRNA   \n",
-       "5132   long intergenic non-protein coding RNA 595     LINC00595        ncRNA   \n",
-       "...                                           ...           ...          ...   \n",
-       "67263                   double homeobox protein 4  LOC124906462       pseudo   \n",
-       "67264                   double homeobox protein 4  LOC124906463       pseudo   \n",
-       "67265                   double homeobox protein 4  LOC124906464       pseudo   \n",
-       "67266                   double homeobox protein 4  LOC124906465       pseudo   \n",
-       "67267                   double homeobox protein 4  LOC124906466       pseudo   \n",
+       "5133   ENSG00000230417              10   LINC00595     414243       1.0   \n",
+       "8675   ENSG00000188660              21   LINC00319  124900467       1.0   \n",
+       "8676   ENSG00000188660              21   LINC00319  102724398       1.0   \n",
+       "12016  ENSG00000278903              21              124905527       1.0   \n",
+       "12017  ENSG00000278903              21              124905312       1.0   \n",
+       "12018  ENSG00000278903              21              124905468       1.0   \n",
+       "43337  ENSG00000249738               5              105377683       1.0   \n",
+       "43338  ENSG00000249738               5                 285626       1.0   \n",
+       "43519  ENSG00000230373              15   GOLGA6L5P  100133220       1.0   \n",
+       "43520  ENSG00000230373              15   GOLGA6L5P     642402       1.0   \n",
+       "63071  ENSG00000293331               1              101928626       2.0   \n",
+       "63072  ENSG00000293331               1              124901156       2.0   \n",
+       "63715  ENSG00000276518                         NaN  128966722       2.0   \n",
+       "63716  ENSG00000276518                         NaN  128966730       2.0   \n",
+       "63717  ENSG00000276518                         NaN  128966732       2.0   \n",
+       "63718  ENSG00000276518                         NaN  128966731       2.0   \n",
+       "63719  ENSG00000276518                         NaN  128966733       2.0   \n",
        "\n",
-       "             alias summary notfound  \n",
-       "4089            []     NaN      NaN  \n",
-       "4090            []     NaN      NaN  \n",
-       "5130   [C10orf101]     NaN      NaN  \n",
-       "5131   [C10orf101]     NaN      NaN  \n",
-       "5132   [C10orf101]     NaN      NaN  \n",
-       "...            ...     ...      ...  \n",
-       "67263           []     NaN      NaN  \n",
-       "67264           []     NaN      NaN  \n",
-       "67265           []     NaN      NaN  \n",
-       "67266           []     NaN      NaN  \n",
-       "67267           []     NaN      NaN  \n",
+       "                                                    name         symbol  \\\n",
+       "4089                        uncharacterized LOC101927042   LOC101927042   \n",
+       "4090                        uncharacterized LOC124902157   LOC124902157   \n",
+       "5130          long intergenic non-protein coding RNA 595      LINC00595   \n",
+       "5131          long intergenic non-protein coding RNA 595      LINC00595   \n",
+       "5132          long intergenic non-protein coding RNA 595      LINC00595   \n",
+       "5133          long intergenic non-protein coding RNA 595      LINC00595   \n",
+       "8675                        uncharacterized LOC124900467   LOC124900467   \n",
+       "8676                       uncharacterized CH507-42P11.6  CH507-42P11.6   \n",
+       "12016                       uncharacterized LOC124905527   LOC124905527   \n",
+       "12017                       uncharacterized LOC124905312   LOC124905312   \n",
+       "12018                       uncharacterized LOC124905468   LOC124905468   \n",
+       "43337                       uncharacterized LOC105377683   LOC105377683   \n",
+       "43338                          uncharacterized LOC285626      LOC285626   \n",
+       "43519                golgin A6 family like 3, pseudogene      GOLGA6L3P   \n",
+       "43520               golgin A6 family like 17, pseudogene     GOLGA6L17P   \n",
+       "63071                       uncharacterized LOC101928626   LOC101928626   \n",
+       "63072                       uncharacterized LOC124901156   LOC124901156   \n",
+       "63715  putative killer cell immunoglobulin-like recep...   LOC128966722   \n",
+       "63716  putative killer cell immunoglobulin-like recep...   LOC128966730   \n",
+       "63717  putative killer cell immunoglobulin-like recep...   LOC128966732   \n",
+       "63718  putative killer cell immunoglobulin-like recep...   LOC128966731   \n",
+       "63719  putative killer cell immunoglobulin-like recep...   LOC128966733   \n",
        "\n",
-       "[614 rows x 11 columns]"
+       "         type_of_gene         alias summary notfound  \n",
+       "4089            ncRNA            []     NaN      NaN  \n",
+       "4090            ncRNA            []     NaN      NaN  \n",
+       "5130            ncRNA   [C10orf101]     NaN      NaN  \n",
+       "5131            ncRNA   [C10orf101]     NaN      NaN  \n",
+       "5132            ncRNA   [C10orf101]     NaN      NaN  \n",
+       "5133            ncRNA   [C10orf101]     NaN      NaN  \n",
+       "8675   protein-coding            []     NaN      NaN  \n",
+       "8676            ncRNA            []     NaN      NaN  \n",
+       "12016           ncRNA            []     NaN      NaN  \n",
+       "12017           ncRNA            []     NaN      NaN  \n",
+       "12018           ncRNA            []     NaN      NaN  \n",
+       "43337           ncRNA            []     NaN      NaN  \n",
+       "43338           ncRNA            []     NaN      NaN  \n",
+       "43519          pseudo    [GOLGA6L3]     NaN      NaN  \n",
+       "43520          pseudo  [GOLGA6L21P]     NaN      NaN  \n",
+       "63071           ncRNA            []     NaN      NaN  \n",
+       "63072           ncRNA            []     NaN      NaN  \n",
+       "63715  protein-coding            []     NaN      NaN  \n",
+       "63716  protein-coding            []     NaN      NaN  \n",
+       "63717  protein-coding            []     NaN      NaN  \n",
+       "63718  protein-coding            []     NaN      NaN  \n",
+       "63719  protein-coding            []     NaN      NaN  "
       ]
      },
      "execution_count": 12,
@@ -1226,7 +1391,7 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "29 duplicated genes have been processed.\n"
+      "8 duplicated genes have been processed.\n"
      ]
     },
     {
@@ -1265,49 +1430,35 @@
        "  </thead>\n",
        "  <tbody>\n",
        "    <tr>\n",
-       "      <th>66803</th>\n",
-       "      <td>ENSG00000284181</td>\n",
+       "      <th>64998</th>\n",
+       "      <td>ENSG00000277936</td>\n",
        "      <td></td>\n",
        "      <td>NaN</td>\n",
-       "      <td>124905410</td>\n",
+       "      <td>84311</td>\n",
        "      <td>1.0</td>\n",
-       "      <td>double homeobox protein 4</td>\n",
-       "      <td>LOC124905410</td>\n",
+       "      <td>mitochondrial ribosomal protein L45</td>\n",
+       "      <td>MRPL45</td>\n",
        "      <td>protein-coding</td>\n",
-       "      <td>[LOC124906461, LOC124906459, LOC124906465, LOC...</td>\n",
-       "      <td>NaN</td>\n",
+       "      <td>[MRP-L45, L45mt, Mba1, mL45]</td>\n",
+       "      <td>Mammalian mitochondrial ribosomal proteins are...</td>\n",
        "      <td>NaN</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>66804</th>\n",
-       "      <td>ENSG00000284262</td>\n",
+       "      <th>64999</th>\n",
+       "      <td>ENSG00000277328</td>\n",
        "      <td></td>\n",
        "      <td>NaN</td>\n",
-       "      <td>124906452</td>\n",
-       "      <td>2.0</td>\n",
-       "      <td>double homeobox protein 4</td>\n",
-       "      <td>LOC124906452</td>\n",
-       "      <td>pseudo</td>\n",
-       "      <td>[LOC124906461, LOC124906459, LOC124906465, LOC...</td>\n",
        "      <td>NaN</td>\n",
        "      <td>NaN</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>66805</th>\n",
-       "      <td>ENSG00000284496</td>\n",
-       "      <td></td>\n",
        "      <td>NaN</td>\n",
-       "      <td>124906452</td>\n",
-       "      <td>2.0</td>\n",
-       "      <td>double homeobox protein 4</td>\n",
-       "      <td>LOC124906452</td>\n",
-       "      <td>pseudo</td>\n",
-       "      <td>[LOC124906461, LOC124906459, LOC124906465, LOC...</td>\n",
        "      <td>NaN</td>\n",
        "      <td>NaN</td>\n",
+       "      <td>[]</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>True</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>66806</th>\n",
+       "      <th>65000</th>\n",
        "      <td>ENSG00000287838</td>\n",
        "      <td>9</td>\n",
        "      <td></td>\n",
@@ -1321,86 +1472,100 @@
        "      <td>NaN</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>66807</th>\n",
-       "      <td>ENSG00000284383</td>\n",
+       "      <th>65001</th>\n",
+       "      <td>ENSG00000249738</td>\n",
+       "      <td>5</td>\n",
        "      <td></td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>124906452</td>\n",
-       "      <td>2.0</td>\n",
-       "      <td>double homeobox protein 4</td>\n",
-       "      <td>LOC124906452</td>\n",
-       "      <td>pseudo</td>\n",
-       "      <td>[LOC124906461, LOC124906459, LOC124906465, LOC...</td>\n",
+       "      <td>105377683</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>uncharacterized LOC105377683</td>\n",
+       "      <td>LOC105377683</td>\n",
+       "      <td>ncRNA</td>\n",
+       "      <td>[LOC285626]</td>\n",
        "      <td>NaN</td>\n",
        "      <td>NaN</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>66808</th>\n",
-       "      <td>ENSG00000283767</td>\n",
+       "      <th>65002</th>\n",
+       "      <td>ENSG00000293331</td>\n",
+       "      <td>1</td>\n",
        "      <td></td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>124906452</td>\n",
+       "      <td>101928626</td>\n",
        "      <td>2.0</td>\n",
-       "      <td>double homeobox protein 4</td>\n",
-       "      <td>LOC124906452</td>\n",
-       "      <td>pseudo</td>\n",
-       "      <td>[LOC124906461, LOC124906459, LOC124906465, LOC...</td>\n",
+       "      <td>uncharacterized LOC101928626</td>\n",
+       "      <td>LOC101928626</td>\n",
+       "      <td>ncRNA</td>\n",
+       "      <td>[LOC124901156]</td>\n",
        "      <td>NaN</td>\n",
        "      <td>NaN</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>66809</th>\n",
-       "      <td>ENSG00000283884</td>\n",
+       "      <th>65003</th>\n",
+       "      <td>ENSG00000276518</td>\n",
        "      <td></td>\n",
        "      <td>NaN</td>\n",
-       "      <td>124906452</td>\n",
+       "      <td>128966722</td>\n",
        "      <td>2.0</td>\n",
-       "      <td>double homeobox protein 4</td>\n",
-       "      <td>LOC124906452</td>\n",
-       "      <td>pseudo</td>\n",
-       "      <td>[LOC124906461, LOC124906459, LOC124906465, LOC...</td>\n",
+       "      <td>putative killer cell immunoglobulin-like recep...</td>\n",
+       "      <td>LOC128966722</td>\n",
+       "      <td>protein-coding</td>\n",
+       "      <td>[LOC128966731, LOC128966733, LOC128966730, LOC...</td>\n",
        "      <td>NaN</td>\n",
        "      <td>NaN</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>66810</th>\n",
-       "      <td>ENSG00000277660</td>\n",
-       "      <td></td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>124904108</td>\n",
+       "      <th>65004</th>\n",
+       "      <td>ENSG00000230417</td>\n",
+       "      <td>10</td>\n",
+       "      <td>LINC00595</td>\n",
+       "      <td>414243</td>\n",
        "      <td>1.0</td>\n",
-       "      <td>U6 spliceosomal RNA</td>\n",
-       "      <td>LOC124904108</td>\n",
-       "      <td>snRNA</td>\n",
-       "      <td>[LOC124906683]</td>\n",
+       "      <td>long intergenic non-protein coding RNA 595</td>\n",
+       "      <td>LINC00595</td>\n",
+       "      <td>ncRNA</td>\n",
+       "      <td>[LINC00595, C10orf101]</td>\n",
        "      <td>NaN</td>\n",
        "      <td>NaN</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>66811</th>\n",
-       "      <td>ENSG00000283955</td>\n",
+       "      <th>65005</th>\n",
+       "      <td>ENSG00000278903</td>\n",
+       "      <td>21</td>\n",
        "      <td></td>\n",
+       "      <td>124905527</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>uncharacterized LOC124905527</td>\n",
+       "      <td>LOC124905527</td>\n",
+       "      <td>ncRNA</td>\n",
+       "      <td>[LOC124905468, LOC124905312]</td>\n",
        "      <td>NaN</td>\n",
-       "      <td>124906452</td>\n",
-       "      <td>2.0</td>\n",
-       "      <td>double homeobox protein 4</td>\n",
-       "      <td>LOC124906452</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>65006</th>\n",
+       "      <td>ENSG00000230373</td>\n",
+       "      <td>15</td>\n",
+       "      <td>GOLGA6L5P</td>\n",
+       "      <td>100133220</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>golgin A6 family like 3, pseudogene</td>\n",
+       "      <td>GOLGA6L3P</td>\n",
        "      <td>pseudo</td>\n",
-       "      <td>[LOC124906461, LOC124906459, LOC124906465, LOC...</td>\n",
+       "      <td>[GOLGA6L17P, GOLGA6L21P, GOLGA6L3]</td>\n",
        "      <td>NaN</td>\n",
        "      <td>NaN</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>66812</th>\n",
-       "      <td>ENSG00000275405</td>\n",
-       "      <td></td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>124905321</td>\n",
+       "      <th>65007</th>\n",
+       "      <td>ENSG00000188660</td>\n",
+       "      <td>21</td>\n",
+       "      <td>LINC00319</td>\n",
+       "      <td>124900467</td>\n",
        "      <td>1.0</td>\n",
-       "      <td>U1 spliceosomal RNA</td>\n",
-       "      <td>LOC124905321</td>\n",
-       "      <td>snRNA</td>\n",
-       "      <td>[LOC124904613, LOC124905809, LOC124905573]</td>\n",
+       "      <td>uncharacterized LOC124900467</td>\n",
+       "      <td>LOC124900467</td>\n",
+       "      <td>protein-coding</td>\n",
+       "      <td>[CH507-42P11.6]</td>\n",
        "      <td>NaN</td>\n",
        "      <td>NaN</td>\n",
        "    </tr>\n",
@@ -1409,41 +1574,53 @@
        "</div>"
       ],
       "text/plain": [
-       "       ensembl_gene_id chromosome_name hgnc_symbol        _id  _version  \\\n",
-       "66803  ENSG00000284181                         NaN  124905410       1.0   \n",
-       "66804  ENSG00000284262                         NaN  124906452       2.0   \n",
-       "66805  ENSG00000284496                         NaN  124906452       2.0   \n",
-       "66806  ENSG00000287838               9              101927042       1.0   \n",
-       "66807  ENSG00000284383                         NaN  124906452       2.0   \n",
-       "66808  ENSG00000283767                         NaN  124906452       2.0   \n",
-       "66809  ENSG00000283884                         NaN  124906452       2.0   \n",
-       "66810  ENSG00000277660                         NaN  124904108       1.0   \n",
-       "66811  ENSG00000283955                         NaN  124906452       2.0   \n",
-       "66812  ENSG00000275405                         NaN  124905321       1.0   \n",
+       "       ensembl_gene_id chromosome_name hgnc_symbol        _id _version  \\\n",
+       "64998  ENSG00000277936                         NaN      84311      1.0   \n",
+       "64999  ENSG00000277328                         NaN        NaN      NaN   \n",
+       "65000  ENSG00000287838               9              101927042      1.0   \n",
+       "65001  ENSG00000249738               5              105377683      1.0   \n",
+       "65002  ENSG00000293331               1              101928626      2.0   \n",
+       "65003  ENSG00000276518                         NaN  128966722      2.0   \n",
+       "65004  ENSG00000230417              10   LINC00595     414243      1.0   \n",
+       "65005  ENSG00000278903              21              124905527      1.0   \n",
+       "65006  ENSG00000230373              15   GOLGA6L5P  100133220      1.0   \n",
+       "65007  ENSG00000188660              21   LINC00319  124900467      1.0   \n",
        "\n",
-       "                               name        symbol    type_of_gene  \\\n",
-       "66803     double homeobox protein 4  LOC124905410  protein-coding   \n",
-       "66804     double homeobox protein 4  LOC124906452          pseudo   \n",
-       "66805     double homeobox protein 4  LOC124906452          pseudo   \n",
-       "66806  uncharacterized LOC101927042  LOC101927042           ncRNA   \n",
-       "66807     double homeobox protein 4  LOC124906452          pseudo   \n",
-       "66808     double homeobox protein 4  LOC124906452          pseudo   \n",
-       "66809     double homeobox protein 4  LOC124906452          pseudo   \n",
-       "66810           U6 spliceosomal RNA  LOC124904108           snRNA   \n",
-       "66811     double homeobox protein 4  LOC124906452          pseudo   \n",
-       "66812           U1 spliceosomal RNA  LOC124905321           snRNA   \n",
+       "                                                    name        symbol  \\\n",
+       "64998                mitochondrial ribosomal protein L45        MRPL45   \n",
+       "64999                                                NaN           NaN   \n",
+       "65000                       uncharacterized LOC101927042  LOC101927042   \n",
+       "65001                       uncharacterized LOC105377683  LOC105377683   \n",
+       "65002                       uncharacterized LOC101928626  LOC101928626   \n",
+       "65003  putative killer cell immunoglobulin-like recep...  LOC128966722   \n",
+       "65004         long intergenic non-protein coding RNA 595     LINC00595   \n",
+       "65005                       uncharacterized LOC124905527  LOC124905527   \n",
+       "65006                golgin A6 family like 3, pseudogene     GOLGA6L3P   \n",
+       "65007                       uncharacterized LOC124900467  LOC124900467   \n",
        "\n",
-       "                                                   alias summary notfound  \n",
-       "66803  [LOC124906461, LOC124906459, LOC124906465, LOC...     NaN      NaN  \n",
-       "66804  [LOC124906461, LOC124906459, LOC124906465, LOC...     NaN      NaN  \n",
-       "66805  [LOC124906461, LOC124906459, LOC124906465, LOC...     NaN      NaN  \n",
-       "66806                                     [LOC124902157]     NaN      NaN  \n",
-       "66807  [LOC124906461, LOC124906459, LOC124906465, LOC...     NaN      NaN  \n",
-       "66808  [LOC124906461, LOC124906459, LOC124906465, LOC...     NaN      NaN  \n",
-       "66809  [LOC124906461, LOC124906459, LOC124906465, LOC...     NaN      NaN  \n",
-       "66810                                     [LOC124906683]     NaN      NaN  \n",
-       "66811  [LOC124906461, LOC124906459, LOC124906465, LOC...     NaN      NaN  \n",
-       "66812         [LOC124904613, LOC124905809, LOC124905573]     NaN      NaN  "
+       "         type_of_gene                                              alias  \\\n",
+       "64998  protein-coding                       [MRP-L45, L45mt, Mba1, mL45]   \n",
+       "64999             NaN                                                 []   \n",
+       "65000           ncRNA                                     [LOC124902157]   \n",
+       "65001           ncRNA                                        [LOC285626]   \n",
+       "65002           ncRNA                                     [LOC124901156]   \n",
+       "65003  protein-coding  [LOC128966731, LOC128966733, LOC128966730, LOC...   \n",
+       "65004           ncRNA                             [LINC00595, C10orf101]   \n",
+       "65005           ncRNA                       [LOC124905468, LOC124905312]   \n",
+       "65006          pseudo                 [GOLGA6L17P, GOLGA6L21P, GOLGA6L3]   \n",
+       "65007  protein-coding                                    [CH507-42P11.6]   \n",
+       "\n",
+       "                                                 summary notfound  \n",
+       "64998  Mammalian mitochondrial ribosomal proteins are...      NaN  \n",
+       "64999                                                NaN     True  \n",
+       "65000                                                NaN      NaN  \n",
+       "65001                                                NaN      NaN  \n",
+       "65002                                                NaN      NaN  \n",
+       "65003                                                NaN      NaN  \n",
+       "65004                                                NaN      NaN  \n",
+       "65005                                                NaN      NaN  \n",
+       "65006                                                NaN      NaN  \n",
+       "65007                                                NaN      NaN  "
       ]
      },
      "execution_count": 13,
@@ -1453,7 +1630,7 @@
    ],
    "source": [
     "non_dupes = set(gene_table_merged.index) - set(all_duplicated.index)\n",
-    "keep_df = gene_table_merged.loc[non_dupes].copy(deep=True)\n",
+    "keep_df = gene_table_merged.loc[list(non_dupes)].copy(deep=True)\n",
     "\n",
     "# For each duplicated Ensembl ID, collapse to 1 row and append that row to keep_df\n",
     "for ens_id in set(all_duplicated[\"ensembl_gene_id\"]):\n",
@@ -1483,7 +1660,7 @@
     "        group.at[group.index[0], \"alias\"] = list(set(group.at[group.index[0], \"alias\"]))\n",
     "\n",
     "        # Keep the first row only, which now has all the aliases\n",
-    "        keep_df = keep_df.append(group.iloc[0], ignore_index=True)\n",
+    "        keep_df = pd.concat([keep_df, group.iloc[0].to_frame().T], ignore_index=True)\n",
     "\n",
     "print(\n",
     "    str(len(all_duplicated.drop_duplicates(\"ensembl_gene_id\")))\n",
@@ -1596,7 +1773,7 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "66813\n",
+      "65008\n",
       "Querying genes 1 - 1000\n",
       "Querying genes 1001 - 2000\n",
       "Querying genes 2001 - 3000\n",
@@ -1662,9 +1839,8 @@
       "Querying genes 62001 - 63000\n",
       "Querying genes 63001 - 64000\n",
       "Querying genes 64001 - 65000\n",
-      "Querying genes 65001 - 66000\n",
-      "Querying genes 66001 - 66813\n",
-      "66806\n"
+      "Querying genes 65001 - 65008\n",
+      "65008\n"
      ]
     },
     {
@@ -1688,96 +1864,96 @@
        "  <thead>\n",
        "    <tr style=\"text-align: right;\">\n",
        "      <th></th>\n",
-       "      <th>type</th>\n",
-       "      <th>release</th>\n",
-       "      <th>peptide</th>\n",
-       "      <th>version</th>\n",
        "      <th>assembly</th>\n",
+       "      <th>peptide</th>\n",
+       "      <th>release</th>\n",
        "      <th>latest</th>\n",
+       "      <th>possible_replacement</th>\n",
+       "      <th>version</th>\n",
        "      <th>id</th>\n",
+       "      <th>type</th>\n",
        "      <th>is_current</th>\n",
-       "      <th>possible_replacement</th>\n",
        "    </tr>\n",
        "  </thead>\n",
        "  <tbody>\n",
        "    <tr>\n",
-       "      <th>66801</th>\n",
-       "      <td>Gene</td>\n",
-       "      <td>111</td>\n",
-       "      <td>None</td>\n",
-       "      <td>2</td>\n",
+       "      <th>65003</th>\n",
        "      <td>GRCh38</td>\n",
-       "      <td>ENSG00000283767.2</td>\n",
-       "      <td>ENSG00000283767</td>\n",
-       "      <td>1</td>\n",
+       "      <td>None</td>\n",
+       "      <td>111</td>\n",
+       "      <td>ENSG00000276518.1</td>\n",
        "      <td>[]</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>66802</th>\n",
+       "      <td>1</td>\n",
+       "      <td>ENSG00000276518</td>\n",
        "      <td>Gene</td>\n",
-       "      <td>111</td>\n",
-       "      <td>None</td>\n",
-       "      <td>2</td>\n",
-       "      <td>GRCh38</td>\n",
-       "      <td>ENSG00000283884.2</td>\n",
-       "      <td>ENSG00000283884</td>\n",
        "      <td>1</td>\n",
-       "      <td>[]</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>66803</th>\n",
-       "      <td>Gene</td>\n",
-       "      <td>111</td>\n",
-       "      <td>None</td>\n",
-       "      <td>1</td>\n",
+       "      <th>65004</th>\n",
        "      <td>GRCh38</td>\n",
-       "      <td>ENSG00000277660.1</td>\n",
-       "      <td>ENSG00000277660</td>\n",
-       "      <td>1</td>\n",
+       "      <td>None</td>\n",
+       "      <td>111</td>\n",
+       "      <td>ENSG00000230417.12</td>\n",
        "      <td>[]</td>\n",
+       "      <td>12</td>\n",
+       "      <td>ENSG00000230417</td>\n",
+       "      <td>Gene</td>\n",
+       "      <td>1</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>66804</th>\n",
-       "      <td>Gene</td>\n",
-       "      <td>111</td>\n",
-       "      <td>None</td>\n",
-       "      <td>2</td>\n",
+       "      <th>65005</th>\n",
        "      <td>GRCh38</td>\n",
-       "      <td>ENSG00000283955.2</td>\n",
-       "      <td>ENSG00000283955</td>\n",
-       "      <td>1</td>\n",
+       "      <td>None</td>\n",
+       "      <td>111</td>\n",
+       "      <td>ENSG00000278903.5</td>\n",
        "      <td>[]</td>\n",
+       "      <td>5</td>\n",
+       "      <td>ENSG00000278903</td>\n",
+       "      <td>Gene</td>\n",
+       "      <td>1</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>66805</th>\n",
-       "      <td>Gene</td>\n",
-       "      <td>111</td>\n",
+       "      <th>65006</th>\n",
+       "      <td>GRCh38</td>\n",
        "      <td>None</td>\n",
+       "      <td>111</td>\n",
+       "      <td>ENSG00000230373.9</td>\n",
+       "      <td>[]</td>\n",
+       "      <td>9</td>\n",
+       "      <td>ENSG00000230373</td>\n",
+       "      <td>Gene</td>\n",
        "      <td>1</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>65007</th>\n",
        "      <td>GRCh38</td>\n",
-       "      <td>ENSG00000275405.1</td>\n",
-       "      <td>ENSG00000275405</td>\n",
-       "      <td>1</td>\n",
+       "      <td>None</td>\n",
+       "      <td>111</td>\n",
+       "      <td>ENSG00000188660.5</td>\n",
        "      <td>[]</td>\n",
+       "      <td>5</td>\n",
+       "      <td>ENSG00000188660</td>\n",
+       "      <td>Gene</td>\n",
+       "      <td>1</td>\n",
        "    </tr>\n",
        "  </tbody>\n",
        "</table>\n",
        "</div>"
       ],
       "text/plain": [
-       "       type release peptide  version assembly             latest  \\\n",
-       "66801  Gene     111    None        2   GRCh38  ENSG00000283767.2   \n",
-       "66802  Gene     111    None        2   GRCh38  ENSG00000283884.2   \n",
-       "66803  Gene     111    None        1   GRCh38  ENSG00000277660.1   \n",
-       "66804  Gene     111    None        2   GRCh38  ENSG00000283955.2   \n",
-       "66805  Gene     111    None        1   GRCh38  ENSG00000275405.1   \n",
+       "      assembly peptide release              latest possible_replacement  \\\n",
+       "65003   GRCh38    None     111   ENSG00000276518.1                   []   \n",
+       "65004   GRCh38    None     111  ENSG00000230417.12                   []   \n",
+       "65005   GRCh38    None     111   ENSG00000278903.5                   []   \n",
+       "65006   GRCh38    None     111   ENSG00000230373.9                   []   \n",
+       "65007   GRCh38    None     111   ENSG00000188660.5                   []   \n",
        "\n",
-       "                    id is_current possible_replacement  \n",
-       "66801  ENSG00000283767          1                   []  \n",
-       "66802  ENSG00000283884          1                   []  \n",
-       "66803  ENSG00000277660          1                   []  \n",
-       "66804  ENSG00000283955          1                   []  \n",
-       "66805  ENSG00000275405          1                   []  "
+       "       version               id  type is_current  \n",
+       "65003        1  ENSG00000276518  Gene          1  \n",
+       "65004       12  ENSG00000230417  Gene          1  \n",
+       "65005        5  ENSG00000278903  Gene          1  \n",
+       "65006        9  ENSG00000230373  Gene          1  \n",
+       "65007        5  ENSG00000188660  Gene          1  "
       ]
      },
      "execution_count": 15,
@@ -1847,31 +2023,29 @@
        "release\n",
        "100       22\n",
        "101        8\n",
-       "102       24\n",
+       "102       16\n",
        "103       15\n",
        "104       19\n",
        "105        9\n",
        "106       34\n",
        "107       10\n",
        "108        4\n",
-       "109        5\n",
+       "109        4\n",
        "110       11\n",
-       "111    65595\n",
+       "111    63843\n",
        "80        21\n",
        "81         2\n",
        "82        10\n",
        "84       673\n",
-       "85         1\n",
        "87        61\n",
        "89        20\n",
        "91        75\n",
-       "92         3\n",
        "93        53\n",
        "95        33\n",
-       "96        36\n",
+       "96        31\n",
        "97        18\n",
        "98         9\n",
-       "99        35\n",
+       "99         7\n",
        "dtype: int64"
       ]
      },
@@ -1896,16 +2070,14 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "66806\n",
-      "66813\n",
-      "False\n"
+      "65008\n",
+      "65008\n",
+      "True\n"
      ]
     }
    ],
    "source": [
     "# Check that all IDs are the same between the result and the gene table\n",
-    "# NOTE: After addition of biodomains, this is no longer true. There are 7 genes from the biodomains\n",
-    "# dataset that are not in the Ensembl database or archives. \n",
     "print(len(versions[\"id\"]))\n",
     "print(len(gene_table_merged))\n",
     "print(\n",
@@ -1948,7 +2120,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 19,
    "id": "0d5b5652",
    "metadata": {
     "scrolled": true
@@ -1980,24 +2152,24 @@
      "data": {
       "text/plain": [
        "closest_release\n",
-       "80       919\n",
+       "80       915\n",
        "95        33\n",
-       "96        36\n",
+       "96        31\n",
        "97        18\n",
        "98         9\n",
-       "99        35\n",
+       "99         7\n",
        "100       22\n",
        "101        8\n",
-       "102       24\n",
+       "102       16\n",
        "103       15\n",
        "104       19\n",
        "105        9\n",
        "106       34\n",
        "107       10\n",
        "108        4\n",
-       "109        5\n",
+       "109        4\n",
        "110       11\n",
-       "111    65595\n",
+       "111    63843\n",
        "dtype: int64"
       ]
      },
@@ -2049,15 +2221,15 @@
        "  <thead>\n",
        "    <tr style=\"text-align: right;\">\n",
        "      <th></th>\n",
-       "      <th>type</th>\n",
-       "      <th>release</th>\n",
-       "      <th>peptide</th>\n",
-       "      <th>version</th>\n",
        "      <th>assembly</th>\n",
+       "      <th>peptide</th>\n",
+       "      <th>release</th>\n",
        "      <th>latest</th>\n",
+       "      <th>possible_replacement</th>\n",
+       "      <th>version</th>\n",
        "      <th>id</th>\n",
+       "      <th>type</th>\n",
        "      <th>is_current</th>\n",
-       "      <th>possible_replacement</th>\n",
        "      <th>closest_release</th>\n",
        "      <th>permalink</th>\n",
        "    </tr>\n",
@@ -2065,71 +2237,71 @@
        "  <tbody>\n",
        "    <tr>\n",
        "      <th>0</th>\n",
-       "      <td>Gene</td>\n",
-       "      <td>111</td>\n",
-       "      <td>None</td>\n",
-       "      <td>1</td>\n",
        "      <td>GRCh38</td>\n",
+       "      <td>None</td>\n",
+       "      <td>111</td>\n",
        "      <td>ENSG00000210049.1</td>\n",
+       "      <td>[]</td>\n",
+       "      <td>1</td>\n",
        "      <td>ENSG00000210049</td>\n",
+       "      <td>Gene</td>\n",
        "      <td>1</td>\n",
-       "      <td>[]</td>\n",
        "      <td>111</td>\n",
        "      <td>https://jan2024.archive.ensembl.org/Homo_sapie...</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>1</th>\n",
-       "      <td>Gene</td>\n",
-       "      <td>111</td>\n",
-       "      <td>None</td>\n",
-       "      <td>2</td>\n",
        "      <td>GRCh38</td>\n",
+       "      <td>None</td>\n",
+       "      <td>111</td>\n",
        "      <td>ENSG00000211459.2</td>\n",
+       "      <td>[]</td>\n",
+       "      <td>2</td>\n",
        "      <td>ENSG00000211459</td>\n",
+       "      <td>Gene</td>\n",
        "      <td>1</td>\n",
-       "      <td>[]</td>\n",
        "      <td>111</td>\n",
        "      <td>https://jan2024.archive.ensembl.org/Homo_sapie...</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>2</th>\n",
-       "      <td>Gene</td>\n",
-       "      <td>111</td>\n",
-       "      <td>None</td>\n",
-       "      <td>1</td>\n",
        "      <td>GRCh38</td>\n",
+       "      <td>None</td>\n",
+       "      <td>111</td>\n",
        "      <td>ENSG00000210077.1</td>\n",
+       "      <td>[]</td>\n",
+       "      <td>1</td>\n",
        "      <td>ENSG00000210077</td>\n",
+       "      <td>Gene</td>\n",
        "      <td>1</td>\n",
-       "      <td>[]</td>\n",
        "      <td>111</td>\n",
        "      <td>https://jan2024.archive.ensembl.org/Homo_sapie...</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>3</th>\n",
-       "      <td>Gene</td>\n",
-       "      <td>111</td>\n",
-       "      <td>None</td>\n",
-       "      <td>2</td>\n",
        "      <td>GRCh38</td>\n",
+       "      <td>None</td>\n",
+       "      <td>111</td>\n",
        "      <td>ENSG00000210082.2</td>\n",
+       "      <td>[]</td>\n",
+       "      <td>2</td>\n",
        "      <td>ENSG00000210082</td>\n",
+       "      <td>Gene</td>\n",
        "      <td>1</td>\n",
-       "      <td>[]</td>\n",
        "      <td>111</td>\n",
        "      <td>https://jan2024.archive.ensembl.org/Homo_sapie...</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>4</th>\n",
-       "      <td>Gene</td>\n",
-       "      <td>111</td>\n",
-       "      <td>None</td>\n",
-       "      <td>1</td>\n",
        "      <td>GRCh38</td>\n",
+       "      <td>None</td>\n",
+       "      <td>111</td>\n",
        "      <td>ENSG00000209082.1</td>\n",
+       "      <td>[]</td>\n",
+       "      <td>1</td>\n",
        "      <td>ENSG00000209082</td>\n",
+       "      <td>Gene</td>\n",
        "      <td>1</td>\n",
-       "      <td>[]</td>\n",
        "      <td>111</td>\n",
        "      <td>https://jan2024.archive.ensembl.org/Homo_sapie...</td>\n",
        "    </tr>\n",
@@ -2138,19 +2310,19 @@
        "</div>"
       ],
       "text/plain": [
-       "   type release peptide  version assembly             latest               id  \\\n",
-       "0  Gene     111    None        1   GRCh38  ENSG00000210049.1  ENSG00000210049   \n",
-       "1  Gene     111    None        2   GRCh38  ENSG00000211459.2  ENSG00000211459   \n",
-       "2  Gene     111    None        1   GRCh38  ENSG00000210077.1  ENSG00000210077   \n",
-       "3  Gene     111    None        2   GRCh38  ENSG00000210082.2  ENSG00000210082   \n",
-       "4  Gene     111    None        1   GRCh38  ENSG00000209082.1  ENSG00000209082   \n",
+       "  assembly peptide release             latest possible_replacement  version  \\\n",
+       "0   GRCh38    None     111  ENSG00000210049.1                   []        1   \n",
+       "1   GRCh38    None     111  ENSG00000211459.2                   []        2   \n",
+       "2   GRCh38    None     111  ENSG00000210077.1                   []        1   \n",
+       "3   GRCh38    None     111  ENSG00000210082.2                   []        2   \n",
+       "4   GRCh38    None     111  ENSG00000209082.1                   []        1   \n",
        "\n",
-       "  is_current possible_replacement  closest_release  \\\n",
-       "0          1                   []              111   \n",
-       "1          1                   []              111   \n",
-       "2          1                   []              111   \n",
-       "3          1                   []              111   \n",
-       "4          1                   []              111   \n",
+       "                id  type is_current  closest_release  \\\n",
+       "0  ENSG00000210049  Gene          1              111   \n",
+       "1  ENSG00000211459  Gene          1              111   \n",
+       "2  ENSG00000210077  Gene          1              111   \n",
+       "3  ENSG00000210082  Gene          1              111   \n",
+       "4  ENSG00000209082  Gene          1              111   \n",
        "\n",
        "                                           permalink  \n",
        "0  https://jan2024.archive.ensembl.org/Homo_sapie...  \n",
@@ -2206,87 +2378,87 @@
        "  <thead>\n",
        "    <tr style=\"text-align: right;\">\n",
        "      <th></th>\n",
-       "      <th>type</th>\n",
-       "      <th>release</th>\n",
-       "      <th>peptide</th>\n",
-       "      <th>version</th>\n",
        "      <th>assembly</th>\n",
+       "      <th>peptide</th>\n",
+       "      <th>release</th>\n",
        "      <th>latest</th>\n",
+       "      <th>possible_replacement</th>\n",
+       "      <th>version</th>\n",
        "      <th>id</th>\n",
+       "      <th>type</th>\n",
        "      <th>is_current</th>\n",
-       "      <th>possible_replacement</th>\n",
        "      <th>closest_release</th>\n",
        "      <th>permalink</th>\n",
        "    </tr>\n",
        "  </thead>\n",
        "  <tbody>\n",
        "    <tr>\n",
-       "      <th>63184</th>\n",
-       "      <td>Gene</td>\n",
-       "      <td>93</td>\n",
+       "      <th>63180</th>\n",
+       "      <td>GRCh38</td>\n",
        "      <td>None</td>\n",
+       "      <td>84</td>\n",
+       "      <td>ENSG00000238909.1</td>\n",
+       "      <td>[]</td>\n",
        "      <td>1</td>\n",
-       "      <td>GRCh38</td>\n",
-       "      <td>ENSG00000260977.1</td>\n",
-       "      <td>ENSG00000260977</td>\n",
+       "      <td>ENSG00000238909</td>\n",
+       "      <td>Gene</td>\n",
        "      <td></td>\n",
-       "      <td>[]</td>\n",
        "      <td>80</td>\n",
        "      <td>https://may2015.archive.ensembl.org/Homo_sapie...</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>63187</th>\n",
-       "      <td>Gene</td>\n",
-       "      <td>84</td>\n",
+       "      <th>63181</th>\n",
+       "      <td>GRCh38</td>\n",
        "      <td>None</td>\n",
+       "      <td>84</td>\n",
+       "      <td>ENSG00000265155.1</td>\n",
+       "      <td>[]</td>\n",
        "      <td>1</td>\n",
-       "      <td>GRCh38</td>\n",
-       "      <td>ENSG00000280788.1</td>\n",
-       "      <td>ENSG00000280788</td>\n",
+       "      <td>ENSG00000265155</td>\n",
+       "      <td>Gene</td>\n",
        "      <td></td>\n",
-       "      <td>[]</td>\n",
        "      <td>80</td>\n",
        "      <td>https://may2015.archive.ensembl.org/Homo_sapie...</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>63190</th>\n",
-       "      <td>Gene</td>\n",
-       "      <td>84</td>\n",
-       "      <td>None</td>\n",
-       "      <td>2</td>\n",
+       "      <th>63183</th>\n",
        "      <td>GRCh38</td>\n",
-       "      <td>ENSG00000222197.2</td>\n",
-       "      <td>ENSG00000222197</td>\n",
-       "      <td></td>\n",
+       "      <td>None</td>\n",
+       "      <td>84</td>\n",
+       "      <td>ENSG00000275447.1</td>\n",
        "      <td>[]</td>\n",
+       "      <td>1</td>\n",
+       "      <td>ENSG00000275447</td>\n",
+       "      <td>Gene</td>\n",
+       "      <td></td>\n",
        "      <td>80</td>\n",
        "      <td>https://may2015.archive.ensembl.org/Homo_sapie...</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>63192</th>\n",
-       "      <td>Gene</td>\n",
-       "      <td>84</td>\n",
-       "      <td>None</td>\n",
-       "      <td>2</td>\n",
+       "      <th>63184</th>\n",
        "      <td>GRCh38</td>\n",
-       "      <td>ENSG00000265212.2</td>\n",
-       "      <td>ENSG00000265212</td>\n",
-       "      <td></td>\n",
+       "      <td>None</td>\n",
+       "      <td>84</td>\n",
+       "      <td>ENSG00000263623.1</td>\n",
        "      <td>[]</td>\n",
+       "      <td>1</td>\n",
+       "      <td>ENSG00000263623</td>\n",
+       "      <td>Gene</td>\n",
+       "      <td></td>\n",
        "      <td>80</td>\n",
        "      <td>https://may2015.archive.ensembl.org/Homo_sapie...</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>63200</th>\n",
-       "      <td>Gene</td>\n",
-       "      <td>87</td>\n",
+       "      <th>63190</th>\n",
+       "      <td>GRCh38</td>\n",
        "      <td>None</td>\n",
+       "      <td>84</td>\n",
+       "      <td>ENSG00000238644.1</td>\n",
+       "      <td>[]</td>\n",
        "      <td>1</td>\n",
-       "      <td>GRCh38</td>\n",
-       "      <td>ENSG00000279921.1</td>\n",
-       "      <td>ENSG00000279921</td>\n",
+       "      <td>ENSG00000238644</td>\n",
+       "      <td>Gene</td>\n",
        "      <td></td>\n",
-       "      <td>[]</td>\n",
        "      <td>80</td>\n",
        "      <td>https://may2015.archive.ensembl.org/Homo_sapie...</td>\n",
        "    </tr>\n",
@@ -2295,26 +2467,26 @@
        "</div>"
       ],
       "text/plain": [
-       "       type release peptide  version assembly             latest  \\\n",
-       "63184  Gene      93    None        1   GRCh38  ENSG00000260977.1   \n",
-       "63187  Gene      84    None        1   GRCh38  ENSG00000280788.1   \n",
-       "63190  Gene      84    None        2   GRCh38  ENSG00000222197.2   \n",
-       "63192  Gene      84    None        2   GRCh38  ENSG00000265212.2   \n",
-       "63200  Gene      87    None        1   GRCh38  ENSG00000279921.1   \n",
+       "      assembly peptide release             latest possible_replacement  \\\n",
+       "63180   GRCh38    None      84  ENSG00000238909.1                   []   \n",
+       "63181   GRCh38    None      84  ENSG00000265155.1                   []   \n",
+       "63183   GRCh38    None      84  ENSG00000275447.1                   []   \n",
+       "63184   GRCh38    None      84  ENSG00000263623.1                   []   \n",
+       "63190   GRCh38    None      84  ENSG00000238644.1                   []   \n",
        "\n",
-       "                    id is_current possible_replacement  closest_release  \\\n",
-       "63184  ENSG00000260977                              []               80   \n",
-       "63187  ENSG00000280788                              []               80   \n",
-       "63190  ENSG00000222197                              []               80   \n",
-       "63192  ENSG00000265212                              []               80   \n",
-       "63200  ENSG00000279921                              []               80   \n",
+       "       version               id  type is_current  closest_release  \\\n",
+       "63180        1  ENSG00000238909  Gene                          80   \n",
+       "63181        1  ENSG00000265155  Gene                          80   \n",
+       "63183        1  ENSG00000275447  Gene                          80   \n",
+       "63184        1  ENSG00000263623  Gene                          80   \n",
+       "63190        1  ENSG00000238644  Gene                          80   \n",
        "\n",
        "                                               permalink  \n",
+       "63180  https://may2015.archive.ensembl.org/Homo_sapie...  \n",
+       "63181  https://may2015.archive.ensembl.org/Homo_sapie...  \n",
+       "63183  https://may2015.archive.ensembl.org/Homo_sapie...  \n",
        "63184  https://may2015.archive.ensembl.org/Homo_sapie...  \n",
-       "63187  https://may2015.archive.ensembl.org/Homo_sapie...  \n",
-       "63190  https://may2015.archive.ensembl.org/Homo_sapie...  \n",
-       "63192  https://may2015.archive.ensembl.org/Homo_sapie...  \n",
-       "63200  https://may2015.archive.ensembl.org/Homo_sapie...  "
+       "63190  https://may2015.archive.ensembl.org/Homo_sapie...  "
       ]
      },
      "execution_count": 22,
@@ -2387,7 +2559,7 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "(66813, 14)\n"
+      "(65008, 14)\n"
      ]
     },
     {
@@ -2518,12 +2690,12 @@
        "</div>"
       ],
       "text/plain": [
-       "   ensembl_gene_id chromosome_name hgnc_symbol   _id  _version      name  \\\n",
-       "0  ENSG00000210049              MT       MT-TF  4558       2.0  tRNA-Phe   \n",
-       "1  ENSG00000211459              MT     MT-RNR1  4549       2.0    s-rRNA   \n",
-       "2  ENSG00000210077              MT       MT-TV  4577       2.0  tRNA-Val   \n",
-       "3  ENSG00000210082              MT     MT-RNR2  4550       2.0    l-rRNA   \n",
-       "4  ENSG00000209082              MT      MT-TL1  4567       2.0  tRNA-Leu   \n",
+       "   ensembl_gene_id chromosome_name hgnc_symbol   _id _version      name  \\\n",
+       "0  ENSG00000210049              MT       MT-TF  4558      2.0  tRNA-Phe   \n",
+       "1  ENSG00000211459              MT     MT-RNR1  4549      2.0    s-rRNA   \n",
+       "2  ENSG00000210077              MT       MT-TV  4577      2.0  tRNA-Val   \n",
+       "3  ENSG00000210082              MT     MT-RNR2  4550      2.0    l-rRNA   \n",
+       "4  ENSG00000209082              MT      MT-TL1  4567      2.0  tRNA-Leu   \n",
        "\n",
        "  symbol type_of_gene     alias  \\\n",
        "0   TRNF         tRNA        []   \n",
@@ -2695,83 +2867,83 @@
        "      <td>...</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>66808</th>\n",
-       "      <td>ENSG00000283767</td>\n",
-       "      <td>double homeobox protein 4</td>\n",
-       "      <td>[LOC124906461, LOC124906459, LOC124906465, LOC...</td>\n",
+       "      <th>65003</th>\n",
+       "      <td>ENSG00000276518</td>\n",
+       "      <td>putative killer cell immunoglobulin-like recep...</td>\n",
+       "      <td>[LOC128966731, LOC128966733, LOC128966730, LOC...</td>\n",
        "      <td>NaN</td>\n",
-       "      <td>LOC124906452</td>\n",
-       "      <td>pseudo</td>\n",
+       "      <td>LOC128966722</td>\n",
+       "      <td>protein-coding</td>\n",
        "      <td>111</td>\n",
        "      <td>[]</td>\n",
        "      <td>https://jan2024.archive.ensembl.org/Homo_sapie...</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>66809</th>\n",
-       "      <td>ENSG00000283884</td>\n",
-       "      <td>double homeobox protein 4</td>\n",
-       "      <td>[LOC124906461, LOC124906459, LOC124906465, LOC...</td>\n",
+       "      <th>65004</th>\n",
+       "      <td>ENSG00000230417</td>\n",
+       "      <td>long intergenic non-protein coding RNA 595</td>\n",
+       "      <td>[LINC00595, C10orf101]</td>\n",
        "      <td>NaN</td>\n",
-       "      <td>LOC124906452</td>\n",
-       "      <td>pseudo</td>\n",
+       "      <td>LINC00595</td>\n",
+       "      <td>ncRNA</td>\n",
        "      <td>111</td>\n",
        "      <td>[]</td>\n",
        "      <td>https://jan2024.archive.ensembl.org/Homo_sapie...</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>66810</th>\n",
-       "      <td>ENSG00000277660</td>\n",
-       "      <td>U6 spliceosomal RNA</td>\n",
-       "      <td>[LOC124906683]</td>\n",
+       "      <th>65005</th>\n",
+       "      <td>ENSG00000278903</td>\n",
+       "      <td>uncharacterized LOC124905527</td>\n",
+       "      <td>[LOC124905468, LOC124905312]</td>\n",
        "      <td>NaN</td>\n",
-       "      <td>LOC124904108</td>\n",
-       "      <td>snRNA</td>\n",
+       "      <td>LOC124905527</td>\n",
+       "      <td>ncRNA</td>\n",
        "      <td>111</td>\n",
        "      <td>[]</td>\n",
        "      <td>https://jan2024.archive.ensembl.org/Homo_sapie...</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>66811</th>\n",
-       "      <td>ENSG00000283955</td>\n",
-       "      <td>double homeobox protein 4</td>\n",
-       "      <td>[LOC124906461, LOC124906459, LOC124906465, LOC...</td>\n",
+       "      <th>65006</th>\n",
+       "      <td>ENSG00000230373</td>\n",
+       "      <td>golgin A6 family like 3, pseudogene</td>\n",
+       "      <td>[GOLGA6L17P, GOLGA6L21P, GOLGA6L3]</td>\n",
        "      <td>NaN</td>\n",
-       "      <td>LOC124906452</td>\n",
+       "      <td>GOLGA6L3P</td>\n",
        "      <td>pseudo</td>\n",
        "      <td>111</td>\n",
        "      <td>[]</td>\n",
        "      <td>https://jan2024.archive.ensembl.org/Homo_sapie...</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>66812</th>\n",
-       "      <td>ENSG00000275405</td>\n",
-       "      <td>U1 spliceosomal RNA</td>\n",
-       "      <td>[LOC124904613, LOC124905809, LOC124905573]</td>\n",
+       "      <th>65007</th>\n",
+       "      <td>ENSG00000188660</td>\n",
+       "      <td>uncharacterized LOC124900467</td>\n",
+       "      <td>[CH507-42P11.6]</td>\n",
        "      <td>NaN</td>\n",
-       "      <td>LOC124905321</td>\n",
-       "      <td>snRNA</td>\n",
+       "      <td>LOC124900467</td>\n",
+       "      <td>protein-coding</td>\n",
        "      <td>111</td>\n",
        "      <td>[]</td>\n",
        "      <td>https://jan2024.archive.ensembl.org/Homo_sapie...</td>\n",
        "    </tr>\n",
        "  </tbody>\n",
        "</table>\n",
-       "<p>66813 rows × 9 columns</p>\n",
+       "<p>65008 rows × 9 columns</p>\n",
        "</div>"
       ],
       "text/plain": [
-       "       ensembl_gene_id                       name  \\\n",
-       "0      ENSG00000210049                   tRNA-Phe   \n",
-       "1      ENSG00000211459                     s-rRNA   \n",
-       "2      ENSG00000210077                   tRNA-Val   \n",
-       "3      ENSG00000210082                     l-rRNA   \n",
-       "4      ENSG00000209082                   tRNA-Leu   \n",
-       "...                ...                        ...   \n",
-       "66808  ENSG00000283767  double homeobox protein 4   \n",
-       "66809  ENSG00000283884  double homeobox protein 4   \n",
-       "66810  ENSG00000277660        U6 spliceosomal RNA   \n",
-       "66811  ENSG00000283955  double homeobox protein 4   \n",
-       "66812  ENSG00000275405        U1 spliceosomal RNA   \n",
+       "       ensembl_gene_id                                               name  \\\n",
+       "0      ENSG00000210049                                           tRNA-Phe   \n",
+       "1      ENSG00000211459                                             s-rRNA   \n",
+       "2      ENSG00000210077                                           tRNA-Val   \n",
+       "3      ENSG00000210082                                             l-rRNA   \n",
+       "4      ENSG00000209082                                           tRNA-Leu   \n",
+       "...                ...                                                ...   \n",
+       "65003  ENSG00000276518  putative killer cell immunoglobulin-like recep...   \n",
+       "65004  ENSG00000230417         long intergenic non-protein coding RNA 595   \n",
+       "65005  ENSG00000278903                       uncharacterized LOC124905527   \n",
+       "65006  ENSG00000230373                golgin A6 family like 3, pseudogene   \n",
+       "65007  ENSG00000188660                       uncharacterized LOC124900467   \n",
        "\n",
        "                                                   alias  \\\n",
        "0                                                     []   \n",
@@ -2780,11 +2952,11 @@
        "3                                               [MTRNR2]   \n",
        "4                                                [MTTL1]   \n",
        "...                                                  ...   \n",
-       "66808  [LOC124906461, LOC124906459, LOC124906465, LOC...   \n",
-       "66809  [LOC124906461, LOC124906459, LOC124906465, LOC...   \n",
-       "66810                                     [LOC124906683]   \n",
-       "66811  [LOC124906461, LOC124906459, LOC124906465, LOC...   \n",
-       "66812         [LOC124904613, LOC124905809, LOC124905573]   \n",
+       "65003  [LOC128966731, LOC128966733, LOC128966730, LOC...   \n",
+       "65004                             [LINC00595, C10orf101]   \n",
+       "65005                       [LOC124905468, LOC124905312]   \n",
+       "65006                 [GOLGA6L17P, GOLGA6L21P, GOLGA6L3]   \n",
+       "65007                                    [CH507-42P11.6]   \n",
        "\n",
        "                                                 summary        symbol  \\\n",
        "0                                                    NaN          TRNF   \n",
@@ -2793,24 +2965,24 @@
        "3      Enables G protein-coupled receptor binding act...          RNR2   \n",
        "4      Implicated in cardiomyopathy. [provided by All...         TRNL1   \n",
        "...                                                  ...           ...   \n",
-       "66808                                                NaN  LOC124906452   \n",
-       "66809                                                NaN  LOC124906452   \n",
-       "66810                                                NaN  LOC124904108   \n",
-       "66811                                                NaN  LOC124906452   \n",
-       "66812                                                NaN  LOC124905321   \n",
+       "65003                                                NaN  LOC128966722   \n",
+       "65004                                                NaN     LINC00595   \n",
+       "65005                                                NaN  LOC124905527   \n",
+       "65006                                                NaN     GOLGA6L3P   \n",
+       "65007                                                NaN  LOC124900467   \n",
        "\n",
-       "      type_of_gene ensembl_release possible_replacement  \\\n",
-       "0             tRNA             111                   []   \n",
-       "1             rRNA             111                   []   \n",
-       "2             tRNA             111                   []   \n",
-       "3             rRNA             111                   []   \n",
-       "4             tRNA             111                   []   \n",
-       "...            ...             ...                  ...   \n",
-       "66808       pseudo             111                   []   \n",
-       "66809       pseudo             111                   []   \n",
-       "66810        snRNA             111                   []   \n",
-       "66811       pseudo             111                   []   \n",
-       "66812        snRNA             111                   []   \n",
+       "         type_of_gene ensembl_release possible_replacement  \\\n",
+       "0                tRNA             111                   []   \n",
+       "1                rRNA             111                   []   \n",
+       "2                tRNA             111                   []   \n",
+       "3                rRNA             111                   []   \n",
+       "4                tRNA             111                   []   \n",
+       "...               ...             ...                  ...   \n",
+       "65003  protein-coding             111                   []   \n",
+       "65004           ncRNA             111                   []   \n",
+       "65005           ncRNA             111                   []   \n",
+       "65006          pseudo             111                   []   \n",
+       "65007  protein-coding             111                   []   \n",
        "\n",
        "                                               permalink  \n",
        "0      https://jan2024.archive.ensembl.org/Homo_sapie...  \n",
@@ -2819,13 +2991,13 @@
        "3      https://jan2024.archive.ensembl.org/Homo_sapie...  \n",
        "4      https://jan2024.archive.ensembl.org/Homo_sapie...  \n",
        "...                                                  ...  \n",
-       "66808  https://jan2024.archive.ensembl.org/Homo_sapie...  \n",
-       "66809  https://jan2024.archive.ensembl.org/Homo_sapie...  \n",
-       "66810  https://jan2024.archive.ensembl.org/Homo_sapie...  \n",
-       "66811  https://jan2024.archive.ensembl.org/Homo_sapie...  \n",
-       "66812  https://jan2024.archive.ensembl.org/Homo_sapie...  \n",
+       "65003  https://jan2024.archive.ensembl.org/Homo_sapie...  \n",
+       "65004  https://jan2024.archive.ensembl.org/Homo_sapie...  \n",
+       "65005  https://jan2024.archive.ensembl.org/Homo_sapie...  \n",
+       "65006  https://jan2024.archive.ensembl.org/Homo_sapie...  \n",
+       "65007  https://jan2024.archive.ensembl.org/Homo_sapie...  \n",
        "\n",
-       "[66813 rows x 9 columns]"
+       "[65008 rows x 9 columns]"
       ]
      },
      "execution_count": 26,
diff --git a/test_config.yaml b/test_config.yaml
index 879a6d1d..0341cfca 100644
--- a/test_config.yaml
+++ b/test_config.yaml
@@ -135,7 +135,7 @@ datasets:
   - gene_info:
       files:
         - name: gene_metadata
-          id: syn25953363.10
+          id: syn25953363.11
           format: feather
         - name: igap
           id: syn12514826.5
@@ -178,7 +178,7 @@ datasets:
         possible_replacement: ensembl_possible_replacements
         permalink: ensembl_permalink
       provenance:
-        - syn25953363.10
+        - syn25953363.11
         - syn12514826.5
         - syn12514912.3
         - *agora_proteomics_provenance

From 699defe432cf4ecbcf98cabf47bd369e0a7aad4e Mon Sep 17 00:00:00 2001
From: Jaclyn Beck <jaclyn.beck@sagebase.org>
Date: Tue, 20 Feb 2024 20:17:29 -0800
Subject: [PATCH 4/7] Moved biodomain split operation before grouping operation
 in gene_info transform

---
 src/agoradatatools/etl/transform/gene_info.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/agoradatatools/etl/transform/gene_info.py b/src/agoradatatools/etl/transform/gene_info.py
index 62a06d8d..769019e8 100644
--- a/src/agoradatatools/etl/transform/gene_info.py
+++ b/src/agoradatatools/etl/transform/gene_info.py
@@ -82,6 +82,10 @@ def transform_gene_info(
     )
 
     biodomains = biodomains.dropna(subset=["biodomain", "ensembl_gene_id"])
+    biodomains = split_delimited_field_to_multiple_rows(
+        df=biodomains, split_field="ensembl_gene_id", delim=";"
+    )
+
     biodomains = (
         biodomains.groupby("ensembl_gene_id")["biodomain"]
         .apply(set)  # ensure unique biodomain names
@@ -90,10 +94,6 @@ def transform_gene_info(
         .rename(columns={"biodomain": "biodomains"})
     )
 
-    biodomains = split_delimited_field_to_multiple_rows(
-        df=biodomains, split_field="ensembl_gene_id", delim=";"
-    )
-
     # sort biodomains list alphabetically
     biodomains["biodomains"] = biodomains["biodomains"].apply(sorted)
 

From 4d5baef2f9949fe431663397d91063735a42d2c5 Mon Sep 17 00:00:00 2001
From: Jaclyn Beck <jaclyn.beck@sagebase.org>
Date: Wed, 21 Feb 2024 10:30:53 -0800
Subject: [PATCH 5/7] Added clarification to the docstring for the new util
 function

---
 src/agoradatatools/etl/utils.py | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/src/agoradatatools/etl/utils.py b/src/agoradatatools/etl/utils.py
index b228ea55..13e57e92 100644
--- a/src/agoradatatools/etl/utils.py
+++ b/src/agoradatatools/etl/utils.py
@@ -225,10 +225,16 @@ def split_delimited_field_to_multiple_rows(
     function creates duplicate rows for each item in that list, with identical data in the other columns. Then the
     target column for these duplicate rows (plus the original row) is assigned a single value from the list, resulting
     in one row per item in the former list.
-    An example of where this function is needed: the genes_biodomains dataset has some semicolon-separated Ensembl IDs
-    in its ensembl_gene_id field, in addition to rows with a single Ensembl ID in the field. For rows with a list of
-    Ensembl IDs, the field is split on ";" and the function creates duplicate rows for each Ensembl ID in the list. Then
-    the ensembl_gene_id field for these duplicates is re-assigned so that there is one Ensembl ID per row.
+    For example, an input data frame that looks like this:
+        biodomain   go_term     ensembl_id
+        Apoptosis   GO:123      ENSG0001
+        Synapse     GO:456      ENSG0002;ENSG0003;ENSG0004
+    will come out looking like this:
+        biodomain   go_term     ensembl_id
+        Apoptosis   GO:123      ENSG0001
+        Synapse     GO:456      ENSG0002
+        Synapse     GO:456      ENSG0003
+        Synapse     GO:456      ENSG0004
 
     Args:
         df (pd.DataFrame): the DataFrame containing a column with delimiter-separated strings. The column can contain a

From 2327fbc4ecc49feab180adc8a5a1098b4c0ef28a Mon Sep 17 00:00:00 2001
From: Jaclyn Beck <jaclyn.beck@sagebase.org>
Date: Wed, 21 Feb 2024 10:47:49 -0800
Subject: [PATCH 6/7] Added test for null values for the split delimited
 function

---
 tests/test_utils.py | 15 ++++++++++++++-
 1 file changed, 14 insertions(+), 1 deletion(-)

diff --git a/tests/test_utils.py b/tests/test_utils.py
index 9f3dbe25..493e5296 100644
--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@@ -434,9 +434,22 @@ def test_split_delimited_field_to_multiple_rows_with_regex_delim(self) -> None:
         assert output.equals(self.expected_df)
 
     def test_split_delimited_field_to_multiple_rows_with_no_split(self) -> None:
-        input_df = self.expected_df.copy()
+        input_df = self.expected_df.copy(deep=True)
         output = utils.split_delimited_field_to_multiple_rows(
             df=input_df, split_field="col_1", delim=";"
         )
 
         assert output.equals(self.expected_df)
+
+    def test_split_delimited_field_to_multiple_rows_type_error(self) -> None:
+        input_df = pd.DataFrame(
+            {
+                "col_1": ["a", None],
+                "col_2": ["x", "y"],
+                "col_3": ["1", "2"],
+            }
+        )
+        with pytest.raises(TypeError, match="has no len()"):
+            utils.split_delimited_field_to_multiple_rows(
+                df=input_df, split_field="col_1", delim=";"
+            )

From e6c30d4a11f5c7cb628b9599c4a3179e53872e00 Mon Sep 17 00:00:00 2001
From: Jaclyn Beck <jaclyn.beck@sagebase.org>
Date: Wed, 21 Feb 2024 12:34:47 -0800
Subject: [PATCH 7/7] Changed resource url variables to uppercase to indicate
 constants

---
 src/agoradatatools/etl/transform/gene_info.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/agoradatatools/etl/transform/gene_info.py b/src/agoradatatools/etl/transform/gene_info.py
index 769019e8..88ab7c4d 100644
--- a/src/agoradatatools/etl/transform/gene_info.py
+++ b/src/agoradatatools/etl/transform/gene_info.py
@@ -102,16 +102,16 @@ def transform_gene_info(
     # tep_info file and not the symbol in gene_info, because there are some mismatches
     # between the two and the hgnc_symbol from tep_info is the correct one to use here.
     # resource_url should be NA if both is_adi and is_tep are false.
-    resource_url_prefix = (
+    RESOURCE_URL_PREFIX = (
         "https://adknowledgeportal.synapse.org/Explore/Target%20Enabling%20Resources?QueryWrapper0=%7B%22sql%22%3A%22"
         + "select%20*%20from%20syn26146692%20WHERE%20%60isPublic%60%20%3D%20true%22%2C%22limit%22%3A25%2C%22offset%22"
         + "%3A0%2C%22selectedFacets%22%3A%5B%7B%22concreteType%22%3A%22org.sagebionetworks.repo.model.table."
         + "FacetColumnValuesRequest%22%2C%22columnName%22%3A%22target%22%2C%22facetValues%22%3A%5B%22"
     )
-    resource_url_suffix = "%22%5D%7D%5D%7D"
+    RESOURCE_URL_SUFFIX = "%22%5D%7D%5D%7D"
     tep_info["resource_url"] = tep_info.apply(
         lambda row: (
-            resource_url_prefix + row["hgnc_symbol"] + resource_url_suffix
+            RESOURCE_URL_PREFIX + row["hgnc_symbol"] + RESOURCE_URL_SUFFIX
             if row["is_adi"] or row["is_tep"]
             else np.NaN
         ),