From 303786e95811848ee2b8268eb365234e9ce2857c Mon Sep 17 00:00:00 2001 From: Jaclyn Beck Date: Fri, 16 Feb 2024 18:25:53 -0800 Subject: [PATCH 1/7] Modified genes_biodomains transform to handle semicolon-separated Ensembl IDs, and edited the tests to test that case --- config.yaml | 4 +- .../etl/transform/genes_biodomains.py | 48 +++++++++++++++++++ test_config.yaml | 4 +- .../input/biodomains_test_input.csv | 4 +- ...domains_test_input_bad_but_should_pass.csv | 2 +- .../output/genes_biodomains.json | 27 ++++++----- tests/transform/test_genes_biodomains.py | 20 ++++++++ 7 files changed, 90 insertions(+), 19 deletions(-) diff --git a/config.yaml b/config.yaml index b70165f9..abce93f6 100644 --- a/config.yaml +++ b/config.yaml @@ -4,10 +4,10 @@ sources: - genes_biodomains: genes_biodomains_files: &genes_biodomains_files - name: genes_biodomains - id: syn44151254.1 + id: syn44151254.4 format: csv genes_biodomains_provenance: &genes_biodomains_provenance - - syn44151254.1 + - syn44151254.4 - overall_scores: overall_scores_files: &overall_scores_files - name: overall_scores diff --git a/src/agoradatatools/etl/transform/genes_biodomains.py b/src/agoradatatools/etl/transform/genes_biodomains.py index 9d278cda..15c44784 100644 --- a/src/agoradatatools/etl/transform/genes_biodomains.py +++ b/src/agoradatatools/etl/transform/genes_biodomains.py @@ -37,6 +37,51 @@ def count_grouped_total( return df +def split_ensembl_ids(genes_biodomains: pd.DataFrame) -> pd.DataFrame: + """The "ensembl_gene_id" column in the genes_biodomains data frame has some single Ensembl IDs and some rows with a + semicolon-separated list of Ensembl IDs. This function finds the rows with semicolons, adds rows to the + genes_biodomains dataframe such that there is one row per Ensembl ID in that list, and assigns a single Ensembl ID + to each row. + + Args: + genes_biodomains (pd.DataFrame): DataFrame containing a column named "ensembl_gene_id" + + Returns: + pd.DataFrame: a DataFrame with the same columns as the input but with additional rows added, plus the + "ensembl_gene_id" column only has one Ensembl ID per row. + """ + + # Split the whole column on ";". Rows that don't need to be split will have a length of 1, while rows that do need + # to be split will have 2 or more in the list. + ens_lists = genes_biodomains["ensembl_gene_id"].str.split(pat=";") + needs_split = ens_lists.apply(len) > 1 + + # Edit the rows where needs_split is True, referencing by the DataFrame index + for df_ind in needs_split.index[needs_split]: + ensembl_ids = ens_lists[df_ind] + + # Guard against extra semicolons or ending the string with a semicolon, which will both result in a blank + # character as an Ensembl ID + ensembl_ids = [x for x in ensembl_ids if x != ""] + + # If there is still more than one Ensembl ID in the list after removing '', add as many new rows as there are + # (Ensembl IDs - 1), since there is already 1 row in the data frame for this group of IDs + if len(ensembl_ids) > 1: + row_dupe = genes_biodomains.loc[df_ind].copy().to_frame().T + + genes_biodomains = pd.concat( + [genes_biodomains] + [row_dupe] * (len(ensembl_ids) - 1) + ) + + # The added rows plus the original row all have the same index, so this sets all rows with that index at once. + genes_biodomains.at[df_ind, "ensembl_gene_id"] = ensembl_ids + + else: + genes_biodomains.at[df_ind, "ensembl_gene_id"] = ensembl_ids[0] + + return genes_biodomains + + def transform_genes_biodomains(datasets: dict) -> pd.DataFrame: """Takes dictionary of dataset DataFrames, extracts the genes_biodomains DataFrame, calculates some metrics on GO terms per gene / biodomain, and @@ -55,6 +100,9 @@ def transform_genes_biodomains(datasets: dict) -> pd.DataFrame: interesting_columns = ["ensembl_gene_id", "biodomain", "go_terms"] genes_biodomains = genes_biodomains[interesting_columns].dropna() + genes_biodomains = split_ensembl_ids(genes_biodomains) + genes_biodomains = genes_biodomains.reset_index(drop=True) + # Count the number of go_terms associated with each biodomain n_biodomain_terms = count_grouped_total( genes_biodomains, "biodomain", "go_terms", "n_biodomain_terms" diff --git a/test_config.yaml b/test_config.yaml index 9e0d7872..879a6d1d 100644 --- a/test_config.yaml +++ b/test_config.yaml @@ -4,10 +4,10 @@ sources: - genes_biodomains: genes_biodomains_files: &genes_biodomains_files - name: genes_biodomains - id: syn44151254.1 + id: syn44151254.4 format: csv genes_biodomains_provenance: &genes_biodomains_provenance - - syn44151254.1 + - syn44151254.4 - overall_scores: overall_scores_files: &overall_scores_files - name: overall_scores diff --git a/tests/test_assets/genes_biodomains/input/biodomains_test_input.csv b/tests/test_assets/genes_biodomains/input/biodomains_test_input.csv index 61233b9d..221f49f1 100644 --- a/tests/test_assets/genes_biodomains/input/biodomains_test_input.csv +++ b/tests/test_assets/genes_biodomains/input/biodomains_test_input.csv @@ -2,7 +2,7 @@ biodomain,abbr,label,color,go_id,go_terms,ensembl_gene_id Autophagy,Au,Autophagy [Au],#9931fd,GO:0006914,autophagy,ENSG00000161011 Autophagy,Au,Autophagy [Au],#9931fd,GO:0016236,macroautophagy,ENSG00000161011 Autophagy,Au,Autophagy [Au],#9931fd,GO:0000422,autophagy of mitochondrion,ENSG00000161011 -Autophagy,Au,Autophagy [Au],#9931fd,GO:0000423,mitophagy,ENSG00000161011 +Autophagy,Au,Autophagy [Au],#9931fd,GO:0000423,mitophagy,ENSG00000161011;ENSG00000000938 Autophagy,Au,Autophagy [Au],#9931fd,GO:0035973,aggrephagy,ENSG00000161011 Autophagy,Au,Autophagy [Au],#9931fd,GO:0038096,Fc-gamma receptor signaling pathway involved in phagocytosis,ENSG00000000938 Autophagy,Au,Autophagy [Au],#9931fd,GO:0050764,regulation of phagocytosis,ENSG00000000938 @@ -34,4 +34,4 @@ Proteostasis,Pr,Proteostasis [Pr],#c8b269,GO:0070037,rRNA (pseudouridine) methyl Proteostasis,Pr,Proteostasis [Pr],#c8b269,GO:0031625,ubiquitin protein ligase binding,ENSG00000161011 Proteostasis,Pr,Proteostasis [Pr],#c8b269,GO:0043130,ubiquitin binding,ENSG00000161011 Proteostasis,Pr,Proteostasis [Pr],#c8b269,GO:0016235,aggresome,ENSG00000161011 -Proteostasis,Pr,Proteostasis [Pr],#c8b269,GO:0016235,aggresome,ENSG00000000938 +Proteostasis,Pr,Proteostasis [Pr],#c8b269,GO:0016235,aggresome,ENSG00000000938;ENSG00000188157;ENSG00000290146 diff --git a/tests/test_assets/genes_biodomains/input/biodomains_test_input_bad_but_should_pass.csv b/tests/test_assets/genes_biodomains/input/biodomains_test_input_bad_but_should_pass.csv index 659ff314..31360d47 100644 --- a/tests/test_assets/genes_biodomains/input/biodomains_test_input_bad_but_should_pass.csv +++ b/tests/test_assets/genes_biodomains/input/biodomains_test_input_bad_but_should_pass.csv @@ -1,7 +1,7 @@ biodomain,abbr,label,color,go_id,go_terms,ensembl_gene_id ,Au,Autophagy [Au],#9931fd,GO:0006914,autophagy,ENSG00000161011 Autophagy,,Autophagy [Au],#9931fd,GO:0016236,macroautophagy,ENSG00000161011 -Autophagy,Au,,#9931fd,GO:0000422,autophagy of mitochondrion,ENSG00000161011 +Autophagy,Au,,#9931fd,GO:0000422,autophagy of mitochondrion,ENSG00000161011; Autophagy,Au,Autophagy [Au],,GO:0000423,mitophagy,ENSG00000161011 Autophagy,Au,Autophagy [Au],#9931fd,,aggrephagy,ENSG00000161011 Autophagy,Au,Autophagy [Au],#9931fd,GO:0038096,,ENSG00000000938 diff --git a/tests/test_assets/genes_biodomains/output/genes_biodomains.json b/tests/test_assets/genes_biodomains/output/genes_biodomains.json index 04eab60c..74a4bd63 100644 --- a/tests/test_assets/genes_biodomains/output/genes_biodomains.json +++ b/tests/test_assets/genes_biodomains/output/genes_biodomains.json @@ -6,11 +6,12 @@ "biodomain": "Autophagy", "go_terms": [ "Fc-gamma receptor signaling pathway involved in phagocytosis", - "regulation of phagocytosis" + "regulation of phagocytosis", + "mitophagy" ], "n_biodomain_terms": 10, - "n_gene_biodomain_terms": 2, - "pct_linking_terms": 28.57 + "n_gene_biodomain_terms": 3, + "pct_linking_terms": 37.50 }, { "biodomain": "Mitochondrial Metabolism", @@ -21,7 +22,7 @@ ], "n_biodomain_terms": 7, "n_gene_biodomain_terms": 3, - "pct_linking_terms": 42.86 + "pct_linking_terms": 37.50 }, { "biodomain": "Proteostasis", @@ -30,7 +31,7 @@ ], "n_biodomain_terms": 8, "n_gene_biodomain_terms": 1, - "pct_linking_terms": 14.29 + "pct_linking_terms": 12.50 }, { "biodomain": "Synapse", @@ -39,7 +40,7 @@ ], "n_biodomain_terms": 9, "n_gene_biodomain_terms": 1, - "pct_linking_terms": 14.29 + "pct_linking_terms": 12.50 } ] }, @@ -105,11 +106,12 @@ { "biodomain": "Proteostasis", "go_terms": [ - "Golgi lumen" + "Golgi lumen", + "aggresome" ], "n_biodomain_terms": 8, - "n_gene_biodomain_terms": 1, - "pct_linking_terms": 14.29 + "n_gene_biodomain_terms": 2, + "pct_linking_terms": 25.00 }, { "biodomain": "Synapse", @@ -123,7 +125,7 @@ ], "n_biodomain_terms": 9, "n_gene_biodomain_terms": 6, - "pct_linking_terms": 85.71 + "pct_linking_terms": 75.00 } ] }, @@ -135,10 +137,11 @@ "go_terms": [ "ribosome biogenesis", "rRNA base methylation", - "rRNA (pseudouridine) methyltransferase activity" + "rRNA (pseudouridine) methyltransferase activity", + "aggresome" ], "n_biodomain_terms": 8, - "n_gene_biodomain_terms": 3, + "n_gene_biodomain_terms": 4, "pct_linking_terms": 100.0 } ] diff --git a/tests/transform/test_genes_biodomains.py b/tests/transform/test_genes_biodomains.py index 799853c8..fa5d976b 100644 --- a/tests/transform/test_genes_biodomains.py +++ b/tests/transform/test_genes_biodomains.py @@ -43,6 +43,26 @@ def test_count_grouped_total_two_groups(self): assert counted.equals(expected_df) +def test_split_ensembl_ids(): + input_df = pd.DataFrame( + { + "ensembl_gene_id": ["a", "a;d", "b;", "b;c;d;e;f"], # 'Ensembl IDs' + "col_2": ["x", "y", "z", "x"], # 3 'biodomains' + "col_3": ["1", "2", "3", "4"], # 4 'go_terms' + } + ) + expected_df = pd.DataFrame( + { + "ensembl_gene_id": ["a", "a", "b", "b", "d", "c", "d", "e", "f"], + "col_2": ["x", "y", "z", "x", "y", "x", "x", "x", "x"], + "col_3": ["1", "2", "3", "4", "2", "4", "4", "4", "4"], + } + ) + output = genes_biodomains.split_ensembl_ids(genes_biodomains=input_df) + output = output.reset_index(drop=True) # reset needed so indices match + assert output.equals(expected_df) + + class TestTransformGenesBiodomains: data_files_path = "tests/test_assets/genes_biodomains" pass_test_data = [ From ca08eff1ea10152d33dc610efd2ed55f65a0c867 Mon Sep 17 00:00:00 2001 From: Jaclyn Beck Date: Tue, 20 Feb 2024 16:48:29 -0800 Subject: [PATCH 2/7] Moved split_ensembl_ids function from biodomains transform to utils file, made it more generic, updated its test, plus sonarcloud delinting --- src/agoradatatools/etl/transform/gene_info.py | 42 +++++-- .../etl/transform/genes_biodomains.py | 85 +++---------- src/agoradatatools/etl/utils.py | 65 +++++++++- tests/test_utils.py | 112 ++++++++++++++---- tests/transform/test_genes_biodomains.py | 36 ++---- 5 files changed, 211 insertions(+), 129 deletions(-) diff --git a/src/agoradatatools/etl/transform/gene_info.py b/src/agoradatatools/etl/transform/gene_info.py index bc30dcd4..62a06d8d 100644 --- a/src/agoradatatools/etl/transform/gene_info.py +++ b/src/agoradatatools/etl/transform/gene_info.py @@ -1,12 +1,12 @@ import numpy as np import pandas as pd -from agoradatatools.etl.utils import nest_fields +from agoradatatools.etl.utils import nest_fields, split_delimited_field_to_multiple_rows def transform_gene_info( - datasets: dict, adjusted_p_value_threshold, protein_level_threshold -): + datasets: dict, adjusted_p_value_threshold: float, protein_level_threshold: float +) -> pd.DataFrame: """ This function will perform transformations and incrementally create a dataset called gene_info. Each dataset will be left_joined onto gene_info, starting with gene_metadata. @@ -81,6 +81,7 @@ def transform_gene_info( drop_columns=["ensembl_gene_id"], ) + biodomains = biodomains.dropna(subset=["biodomain", "ensembl_gene_id"]) biodomains = ( biodomains.groupby("ensembl_gene_id")["biodomain"] .apply(set) # ensure unique biodomain names @@ -89,6 +90,10 @@ def transform_gene_info( .rename(columns={"biodomain": "biodomains"}) ) + biodomains = split_delimited_field_to_multiple_rows( + df=biodomains, split_field="ensembl_gene_id", delim=";" + ) + # sort biodomains list alphabetically biodomains["biodomains"] = biodomains["biodomains"].apply(sorted) @@ -97,12 +102,19 @@ def transform_gene_info( # tep_info file and not the symbol in gene_info, because there are some mismatches # between the two and the hgnc_symbol from tep_info is the correct one to use here. # resource_url should be NA if both is_adi and is_tep are false. - resource_url_prefix = "https://adknowledgeportal.synapse.org/Explore/Target%20Enabling%20Resources?QueryWrapper0=%7B%22sql%22%3A%22select%20*%20from%20syn26146692%20WHERE%20%60isPublic%60%20%3D%20true%22%2C%22limit%22%3A25%2C%22offset%22%3A0%2C%22selectedFacets%22%3A%5B%7B%22concreteType%22%3A%22org.sagebionetworks.repo.model.table.FacetColumnValuesRequest%22%2C%22columnName%22%3A%22target%22%2C%22facetValues%22%3A%5B%22" + resource_url_prefix = ( + "https://adknowledgeportal.synapse.org/Explore/Target%20Enabling%20Resources?QueryWrapper0=%7B%22sql%22%3A%22" + + "select%20*%20from%20syn26146692%20WHERE%20%60isPublic%60%20%3D%20true%22%2C%22limit%22%3A25%2C%22offset%22" + + "%3A0%2C%22selectedFacets%22%3A%5B%7B%22concreteType%22%3A%22org.sagebionetworks.repo.model.table." + + "FacetColumnValuesRequest%22%2C%22columnName%22%3A%22target%22%2C%22facetValues%22%3A%5B%22" + ) resource_url_suffix = "%22%5D%7D%5D%7D" tep_info["resource_url"] = tep_info.apply( - lambda row: resource_url_prefix + row["hgnc_symbol"] + resource_url_suffix - if row["is_adi"] or row["is_tep"] - else np.NaN, + lambda row: ( + resource_url_prefix + row["hgnc_symbol"] + resource_url_suffix + if row["is_adi"] or row["is_tep"] + else np.NaN + ), axis=1, ) @@ -161,9 +173,11 @@ def transform_gene_info( # fillna doesn't work for creating an empty array, need this function instead gene_info["alias"] = gene_info.apply( - lambda row: row["alias"] - if isinstance(row["alias"], np.ndarray) - else np.ndarray(0, dtype=object), + lambda row: ( + row["alias"] + if isinstance(row["alias"], np.ndarray) + else np.ndarray(0, dtype=object) + ), axis=1, ) @@ -179,9 +193,11 @@ def transform_gene_info( # create 'total_nominations' field gene_info["total_nominations"] = gene_info.apply( - lambda row: len(row["target_nominations"]) - if isinstance(row["target_nominations"], list) - else np.NaN, + lambda row: ( + len(row["target_nominations"]) + if isinstance(row["target_nominations"], list) + else np.NaN + ), axis=1, ) diff --git a/src/agoradatatools/etl/transform/genes_biodomains.py b/src/agoradatatools/etl/transform/genes_biodomains.py index 15c44784..8d778c7c 100644 --- a/src/agoradatatools/etl/transform/genes_biodomains.py +++ b/src/agoradatatools/etl/transform/genes_biodomains.py @@ -2,7 +2,7 @@ import pandas as pd -from agoradatatools.etl.utils import nest_fields +from agoradatatools.etl.utils import nest_fields, split_delimited_field_to_multiple_rows def count_grouped_total( @@ -11,22 +11,19 @@ def count_grouped_total( input_colname: str, output_colname: str, ) -> pd.DataFrame: - """For each unique item/combination in the column(s) specified by grouping, - counts the number of unique items in the column [input_colname] that - correspond to that grouping. The calculated counts are put in a new - column and named with [output_colname]. + """For each unique item/combination in the column(s) specified by grouping, counts the number of unique items in the + column [input_colname] that correspond to that grouping. The calculated counts are put in a new column and named + with [output_colname]. + Args: - df (pd.DataFrame): contains columns listed in grouping and - input_colname. May contain other columns as well, but + df (pd.DataFrame): contains columns listed in grouping and input_colname. May contain other columns as well, but these will be dropped from the returned data frame. - grouping (str or list): a string with a single column name, or a list of - strings for multiple column names + grouping (str or list): a string with a single column name, or a list of strings for multiple column names input_colname (str): the name of the column to count output_colname (str): the name of the new column with calculated counts Returns: - pd.DataFrame: a data frame containing the grouping column(s) and a - new column for output_colname, which contains the count of - unique items in input_colname for each grouping item. + pd.DataFrame: a data frame containing the grouping column(s) and a new column for output_colname, which contains + the count of unique items in input_colname for each grouping item. """ df = ( df.groupby(grouping)[input_colname] @@ -37,71 +34,25 @@ def count_grouped_total( return df -def split_ensembl_ids(genes_biodomains: pd.DataFrame) -> pd.DataFrame: - """The "ensembl_gene_id" column in the genes_biodomains data frame has some single Ensembl IDs and some rows with a - semicolon-separated list of Ensembl IDs. This function finds the rows with semicolons, adds rows to the - genes_biodomains dataframe such that there is one row per Ensembl ID in that list, and assigns a single Ensembl ID - to each row. - - Args: - genes_biodomains (pd.DataFrame): DataFrame containing a column named "ensembl_gene_id" - - Returns: - pd.DataFrame: a DataFrame with the same columns as the input but with additional rows added, plus the - "ensembl_gene_id" column only has one Ensembl ID per row. - """ - - # Split the whole column on ";". Rows that don't need to be split will have a length of 1, while rows that do need - # to be split will have 2 or more in the list. - ens_lists = genes_biodomains["ensembl_gene_id"].str.split(pat=";") - needs_split = ens_lists.apply(len) > 1 - - # Edit the rows where needs_split is True, referencing by the DataFrame index - for df_ind in needs_split.index[needs_split]: - ensembl_ids = ens_lists[df_ind] - - # Guard against extra semicolons or ending the string with a semicolon, which will both result in a blank - # character as an Ensembl ID - ensembl_ids = [x for x in ensembl_ids if x != ""] - - # If there is still more than one Ensembl ID in the list after removing '', add as many new rows as there are - # (Ensembl IDs - 1), since there is already 1 row in the data frame for this group of IDs - if len(ensembl_ids) > 1: - row_dupe = genes_biodomains.loc[df_ind].copy().to_frame().T - - genes_biodomains = pd.concat( - [genes_biodomains] + [row_dupe] * (len(ensembl_ids) - 1) - ) - - # The added rows plus the original row all have the same index, so this sets all rows with that index at once. - genes_biodomains.at[df_ind, "ensembl_gene_id"] = ensembl_ids - - else: - genes_biodomains.at[df_ind, "ensembl_gene_id"] = ensembl_ids[0] - - return genes_biodomains - - def transform_genes_biodomains(datasets: dict) -> pd.DataFrame: - """Takes dictionary of dataset DataFrames, extracts the genes_biodomains - DataFrame, calculates some metrics on GO terms per gene / biodomain, and - performs nest_fields on the final DataFrame. This results in a 2 column - DataFrame grouped by "ensembl_gene_id" and includes a collapsed nested - dictionary field "gene_biodomains" + """Takes dictionary of dataset DataFrames, extracts the genes_biodomains DataFrame, calculates some metrics on GO + terms per gene / biodomain, and performs nest_fields on the final DataFrame. This results in a 2 column DataFrame + grouped by "ensembl_gene_id" and includes a collapsed nested dictionary field "gene_biodomains" Args: datasets (dict[str, pd.DataFrame]): dictionary of dataset names mapped to their DataFrame Returns: - pd.DataFrame: 2 column DataFrame grouped by "ensembl_gene_id" including - a collapsed nested dictionary field "gene_biodomains" + pd.DataFrame: 2 column DataFrame grouped by "ensembl_gene_id" including a collapsed nested dictionary field + "gene_biodomains" """ genes_biodomains = datasets["genes_biodomains"] interesting_columns = ["ensembl_gene_id", "biodomain", "go_terms"] genes_biodomains = genes_biodomains[interesting_columns].dropna() - genes_biodomains = split_ensembl_ids(genes_biodomains) - genes_biodomains = genes_biodomains.reset_index(drop=True) + genes_biodomains = split_delimited_field_to_multiple_rows( + df=genes_biodomains, split_field="ensembl_gene_id", delim=";" + ) # Count the number of go_terms associated with each biodomain n_biodomain_terms = count_grouped_total( @@ -151,7 +102,7 @@ def transform_genes_biodomains(datasets: dict) -> pd.DataFrame: df=genes_biodomains, grouping="ensembl_gene_id", new_column="gene_biodomains", - drop_columns="ensembl_gene_id", + drop_columns=["ensembl_gene_id"], ) return genes_biodomains diff --git a/src/agoradatatools/etl/utils.py b/src/agoradatatools/etl/utils.py index 073abe65..b228ea55 100644 --- a/src/agoradatatools/etl/utils.py +++ b/src/agoradatatools/etl/utils.py @@ -1,4 +1,4 @@ -from typing import Union +from typing import Union, Pattern import numpy as np import pandas as pd @@ -110,7 +110,6 @@ def rename_columns(df: pd.DataFrame, column_map: dict) -> pd.DataFrame: df.rename(columns=column_map, inplace=True) except TypeError: print("Column mapping must be a dictionary") - return df return df @@ -180,7 +179,7 @@ def calculate_distribution( Args: df (pd.DataFrame): the DataFrame to calculate distribution for - grouping (str or list of str): the column(s) to group the data frame on (example: "tissue" or ["tissue", "model"]) + grouping (str or list[str]): the column(s) to group the data frame on (example: "tissue" or ["tissue", "model"]) distribution_column (str): the name of the column to calculate distribution on (example: "logfc") Returns: @@ -215,3 +214,63 @@ def calculate_distribution( df.drop("IQR", axis=1, inplace=True) return df + + +def split_delimited_field_to_multiple_rows( + df: pd.DataFrame, split_field: str, delim: Union[str, Pattern] +) -> pd.DataFrame: + """This function takes a dataframe with a column that contains delimiter-separated strings in some or all rows + (instead of a single value), splits those strings on the delimiter, and expands the dataframe so that each item in + the resulting list has its own row. For each row containing a delimiter-separated string in the target column, this + function creates duplicate rows for each item in that list, with identical data in the other columns. Then the + target column for these duplicate rows (plus the original row) is assigned a single value from the list, resulting + in one row per item in the former list. + An example of where this function is needed: the genes_biodomains dataset has some semicolon-separated Ensembl IDs + in its ensembl_gene_id field, in addition to rows with a single Ensembl ID in the field. For rows with a list of + Ensembl IDs, the field is split on ";" and the function creates duplicate rows for each Ensembl ID in the list. Then + the ensembl_gene_id field for these duplicates is re-assigned so that there is one Ensembl ID per row. + + Args: + df (pd.DataFrame): the DataFrame containing a column with delimiter-separated strings. The column can contain a + combination of rows with single values and rows with delimited strings. Every row in the + column should be a string, not a Python list. + split_field (str): the name of the column with the strings to split up + delim (str or Pattern): the delimiter to split the column values on. This may be a string with a single + character (e.g. ","), a string of multiple characters (e.g. ", "), or a compiled regex + (e.g. re.compile("[,;-_]")) + + Returns: + pd.DataFrame: a DataFrame with the same columns as the input but with additional rows added, plus the + split_field column only has one value per row. + """ + + # Split the whole column on the delimiter. Rows that don't need to be split will have a list of length of 1, while + # rows that do need to be split will have 2 or more in the list. + split_lists = df[split_field].str.split(pat=delim) + needs_split = split_lists.apply(len) > 1 + + # Edit the rows where needs_split is True, referencing by the DataFrame index + for df_ind in needs_split.index[needs_split]: + split_items = split_lists[df_ind] + + # Guard against extra delimiters or ending the string with a delimiter, which will both result in a blank + # character as a list item + split_items = [x for x in split_items if x != ""] + + # If there is still more than one item in the list after removing '', add as many new rows as there are + # (items - 1), since there is already 1 row in the data frame for this group of items + if len(split_items) > 1: + row_dupe = df.loc[df_ind].copy().to_frame().T + + df = pd.concat([df] + [row_dupe] * (len(split_items) - 1)) + + # The added rows plus the original row all have the same index, so this sets all rows with that index at + # once. + df.at[df_ind, split_field] = split_items + + # Otherwise change the value in split_field to the only item left in the list, which will erase the extra + # delimiters + else: + df.at[df_ind, split_field] = split_items[0] + + return df.reset_index(drop=True) diff --git a/tests/test_utils.py b/tests/test_utils.py index d205dce9..9f3dbe25 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -8,62 +8,63 @@ import pytest import synapseclient import yaml +import re from agoradatatools.etl import utils class TestLoginToSynapse: @pytest.fixture(scope="function", autouse=True) - def setup_method(self, syn): + def setup_method(self, syn: synapseclient.Synapse) -> None: self.patch_synapseclient = patch.object( synapseclient, "Synapse", return_value=syn ).start() self.patch_syn_login = patch.object(syn, "login", return_value=syn).start() - def teardown_method(self): + def teardown_method(self) -> None: mock.patch.stopall() - def test_login_with_token(self): + def test_login_with_token(self) -> None: utils._login_to_synapse(token="my_auth_token") self.patch_synapseclient.assert_called_once() self.patch_syn_login.assert_called_once_with(authToken="my_auth_token") - def test_login_no_token(self): + def test_login_no_token(self) -> None: utils._login_to_synapse(token=None) self.patch_synapseclient.assert_called_once() self.patch_syn_login.assert_called_once_with() -def test_get_config_with_invalid_file_path(): +def test_get_config_with_invalid_file_path() -> None: with pytest.raises(FileNotFoundError, match="File not found. *"): utils._get_config(config_path="this/is/a/bad/path") -def test_get_config_with_parser_error(): +def test_get_config_with_parser_error() -> None: with pytest.raises( yaml.parser.ParserError, match="YAML file unable to be parsed. *" ): utils._get_config(config_path="./tests/test_assets/bad_config_parsing.yaml") -def test_get_config_with_scanner_error(): +def test_get_config_with_scanner_error() -> None: with pytest.raises( yaml.scanner.ScannerError, match="YAML file unable to be scanned. *" ): utils._get_config(config_path="./tests/test_assets/bad_config_scanning.yaml") -def test_get_config_with_no_config_path(): +def test_get_config_with_no_config_path() -> None: config = utils._get_config(config_path=None) assert config["destination"] == "syn12177492" -def test_get_config_with_config_path(): +def test_get_config_with_config_path() -> None: config = utils._get_config(config_path="./test_config.yaml") assert config["destination"] == "syn17015333" -def test_standardize_column_names(): +def test_standardize_column_names() -> None: df = pd.DataFrame( { "a#": ["test_value"], @@ -117,12 +118,12 @@ class TestStandardizeValues: } ) - def test_standardize_values_success(self): + def test_standardize_values_success(self) -> None: standard_df = utils.standardize_values(df=self.df.copy()) for value in standard_df.iloc[0].tolist(): assert np.isnan(value) - def test_standardize_values_TypeError(self): + def test_standardize_values_TypeError(self) -> None: with patch.object(pd.DataFrame, "replace") as patch_replace: patch_replace.side_effect = TypeError captured_output = StringIO() @@ -144,13 +145,13 @@ class TestRenameColumns: good_column_map = {"a": "e", "b": "f", "c": "g", "d": "h"} bad_column_map = [] - def test_rename_columns_success(self): + def test_rename_columns_success(self) -> None: renamed_df = utils.rename_columns( df=self.df.copy(), column_map=self.good_column_map ) assert list(renamed_df.columns) == list(self.good_column_map.values()) - def test_rename_columns_TypeError(self): + def test_rename_columns_TypeError(self) -> None: captured_output = StringIO() sys.stdout = captured_output bad_renamed_df = utils.rename_columns( @@ -182,7 +183,7 @@ class TestNestFields: } ) - def test_nest_fields_with_dropped_column(self): + def test_nest_fields_with_dropped_column(self) -> None: expected_column_e = [ [ {"a": "group_1", "b": "1", "c": "1"}, @@ -203,7 +204,7 @@ def test_nest_fields_with_dropped_column(self): ) assert list(nested_df["e"]) == expected_column_e - def test_nest_fields_with_dropped_column_list(self): + def test_nest_fields_with_dropped_column_list(self) -> None: expected_column_e = [ [ {"a": "group_1", "c": "1"}, @@ -224,7 +225,7 @@ def test_nest_fields_with_dropped_column_list(self): ) assert list(nested_df["e"]) == expected_column_e - def test_nest_fields_no_drop_column(self): + def test_nest_fields_no_drop_column(self) -> None: expected_column_e = [ [ {"a": "group_1", "b": "1", "c": "1", "d": "1"}, @@ -243,7 +244,7 @@ def test_nest_fields_no_drop_column(self): nested_df = utils.nest_fields(df=self.df_multirow, grouping="a", new_column="e") assert list(nested_df["e"]) == expected_column_e - def test_nest_fields_multirow_ValueError(self): + def test_nest_fields_multirow_ValueError(self) -> None: with pytest.raises(ValueError, match="nested_field_is_list *"): utils.nest_fields( df=self.df_multirow, @@ -253,7 +254,7 @@ def test_nest_fields_multirow_ValueError(self): nested_field_is_list=False, ) - def test_nest_fields_singlerow_nested_list_false(self): + def test_nest_fields_singlerow_nested_list_false(self) -> None: expected_column_e = [ {"a": "group_1", "b": "1", "c": "1"}, {"a": "group_2", "b": "1", "c": "1"}, @@ -337,7 +338,7 @@ class TestCalculateDistribution: ) # Stats on "col_3", grouped by "col_1" only - def test_calculate_distribution_one_group(self): + def test_calculate_distribution_one_group(self) -> None: expected_df = pd.DataFrame( { "col_1": ["a", "b", "c"], @@ -354,7 +355,7 @@ def test_calculate_distribution_one_group(self): assert output_df.equals(expected_df) # Stats on "col_3", grouped by "col_1" and "col_2" - def test_calculate_distribution_two_groups(self): + def test_calculate_distribution_two_groups(self) -> None: expected_df = pd.DataFrame( { "col_1": ["a", "a", "b", "c", "c"], @@ -370,3 +371,72 @@ def test_calculate_distribution_two_groups(self): df=self.df, grouping=["col_1", "col_2"], distribution_column="col_3" ) assert output_df.equals(expected_df) + + +class TestSplitDelimitedFieldToMultipleRows: + """Tests the split_delimited_field_to_multiple_rows function with the 'delim' argument being either a string or a + compiled regex. It also tests the case where this function is called but no values need to be split up. + """ + + expected_df = pd.DataFrame( + { + "col_1": ["a", "a", "b", "b", "d", "c", "d", "e f", "g"], + "col_2": ["x", "y", "z", "x", "y", "x", "x", "x", "x"], + "col_3": ["1", "2", "3", "4", "2", "4", "4", "4", "4"], + } + ) + + def test_split_delimited_field_to_multiple_rows_with_character_delim(self) -> None: + input_df = pd.DataFrame( + { + "col_1": ["a", "a;d", "b;", "b;c;d;e f;g"], # 'Ensembl IDs' + "col_2": ["x", "y", "z", "x"], # 3 'biodomains' + "col_3": ["1", "2", "3", "4"], # 4 'go_terms' + } + ) + + output = utils.split_delimited_field_to_multiple_rows( + df=input_df, split_field="col_1", delim=";" + ) + + assert output.equals(self.expected_df) + + def test_split_delimited_field_to_multiple_rows_with_string_delim(self) -> None: + # The ", " pattern should split on the ", " but not the space in "e f" + input_df = pd.DataFrame( + { + "col_1": ["a", "a, d", "b, ", "b, c, d, e f, g"], # 'Ensembl IDs' + "col_2": ["x", "y", "z", "x"], # 3 'biodomains' + "col_3": ["1", "2", "3", "4"], # 4 'go_terms' + } + ) + + output = utils.split_delimited_field_to_multiple_rows( + df=input_df, split_field="col_1", delim=", " + ) + + assert output.equals(self.expected_df) + + def test_split_delimited_field_to_multiple_rows_with_regex_delim(self) -> None: + input_df = pd.DataFrame( + { + "col_1": ["a", "a;d", "b-", "b_c_d;e f;g"], # 'Ensembl IDs' + "col_2": ["x", "y", "z", "x"], # 3 'biodomains' + "col_3": ["1", "2", "3", "4"], # 4 'go_terms' + } + ) + + pattern = re.compile("[;_-]") + output = utils.split_delimited_field_to_multiple_rows( + df=input_df, split_field="col_1", delim=pattern + ) + + assert output.equals(self.expected_df) + + def test_split_delimited_field_to_multiple_rows_with_no_split(self) -> None: + input_df = self.expected_df.copy() + output = utils.split_delimited_field_to_multiple_rows( + df=input_df, split_field="col_1", delim=";" + ) + + assert output.equals(self.expected_df) diff --git a/tests/transform/test_genes_biodomains.py b/tests/transform/test_genes_biodomains.py index fa5d976b..c05af11c 100644 --- a/tests/transform/test_genes_biodomains.py +++ b/tests/transform/test_genes_biodomains.py @@ -1,3 +1,4 @@ +""" Integration tests for the genes_biodomains transform """ import os import pandas as pd @@ -7,6 +8,8 @@ class TestCountGroupedTotal: + """Tests the count_grouped_total method two ways: grouping by one column only, and grouping by two columns.""" + df = pd.DataFrame( { "col_1": ["a", "a", "a", "b", "c", "c", "c"], # 3 'Ensembl IDs' @@ -17,7 +20,7 @@ class TestCountGroupedTotal: ) # How many unique "col_2"'s per unique "col_1" value? - def test_count_grouped_total_one_group(self): + def test_count_grouped_total_one_group(self) -> None: expected_df = pd.DataFrame({"col_1": ["a", "b", "c"], "output": [3, 1, 2]}) counted = genes_biodomains.count_grouped_total( df=self.df, grouping="col_1", input_colname="col_2", output_colname="output" @@ -25,7 +28,7 @@ def test_count_grouped_total_one_group(self): assert counted.equals(expected_df) # How many unique "col_3"'s per unique combination of "col_1" + "col_2"? - def test_count_grouped_total_two_groups(self): + def test_count_grouped_total_two_groups(self) -> None: expected_df = pd.DataFrame( { "col_1": ["a", "a", "a", "b", "c", "c"], @@ -43,27 +46,10 @@ def test_count_grouped_total_two_groups(self): assert counted.equals(expected_df) -def test_split_ensembl_ids(): - input_df = pd.DataFrame( - { - "ensembl_gene_id": ["a", "a;d", "b;", "b;c;d;e;f"], # 'Ensembl IDs' - "col_2": ["x", "y", "z", "x"], # 3 'biodomains' - "col_3": ["1", "2", "3", "4"], # 4 'go_terms' - } - ) - expected_df = pd.DataFrame( - { - "ensembl_gene_id": ["a", "a", "b", "b", "d", "c", "d", "e", "f"], - "col_2": ["x", "y", "z", "x", "y", "x", "x", "x", "x"], - "col_3": ["1", "2", "3", "4", "2", "4", "4", "4", "4"], - } - ) - output = genes_biodomains.split_ensembl_ids(genes_biodomains=input_df) - output = output.reset_index(drop=True) # reset needed so indices match - assert output.equals(expected_df) - - class TestTransformGenesBiodomains: + """Tests the genes_biodomains custom transform with 'perfect' input, input with missing data, and data that results + in all rows being dropped from the data frame (which causes a failure).""" + data_files_path = "tests/test_assets/genes_biodomains" pass_test_data = [ ( # pass with good data @@ -90,8 +76,8 @@ class TestTransformGenesBiodomains: "input_file, expected_output_file", pass_test_data, ids=pass_test_ids ) def test_transform_genes_biodomains_should_pass( - self, input_file, expected_output_file - ): + self, input_file: str, expected_output_file: str + ) -> None: input_df = pd.read_csv(os.path.join(self.data_files_path, "input", input_file)) output_df = genes_biodomains.transform_genes_biodomains( datasets={"genes_biodomains": input_df} @@ -102,7 +88,7 @@ def test_transform_genes_biodomains_should_pass( pd.testing.assert_frame_equal(output_df, expected_df) @pytest.mark.parametrize("input_file", fail_test_data, ids=fail_test_ids) - def test_transform_genes_biodomains_should_fail(self, input_file): + def test_transform_genes_biodomains_should_fail(self, input_file: str) -> None: with pytest.raises( ValueError, match="cannot insert ensembl_gene_id, already exists" ): From 04014dd1f6676729eebaf6e38a5fff1a30669a1c Mon Sep 17 00:00:00 2001 From: Jaclyn Beck Date: Tue, 20 Feb 2024 19:00:27 -0800 Subject: [PATCH 3/7] Updated gene annotation pre-processing to handle new biodomains file, bumped version of gene_table_merged to the new file --- config.yaml | 4 +- .../AG-896_Preprocess_Gene_Annotations.ipynb | 1132 ++++++++++------- test_config.yaml | 4 +- 3 files changed, 656 insertions(+), 484 deletions(-) diff --git a/config.yaml b/config.yaml index abce93f6..6a1393ef 100644 --- a/config.yaml +++ b/config.yaml @@ -135,7 +135,7 @@ datasets: - gene_info: files: - name: gene_metadata - id: syn25953363.10 + id: syn25953363.11 format: feather - name: igap id: syn12514826.5 @@ -178,7 +178,7 @@ datasets: possible_replacement: ensembl_possible_replacements permalink: ensembl_permalink provenance: - - syn25953363.10 + - syn25953363.11 - syn12514826.5 - syn12514912.3 - *agora_proteomics_provenance diff --git a/data_analysis/agora/notebooks/preprocessing/AG-896_Preprocess_Gene_Annotations.ipynb b/data_analysis/agora/notebooks/preprocessing/AG-896_Preprocess_Gene_Annotations.ipynb index 22446910..23240589 100644 --- a/data_analysis/agora/notebooks/preprocessing/AG-896_Preprocess_Gene_Annotations.ipynb +++ b/data_analysis/agora/notebooks/preprocessing/AG-896_Preprocess_Gene_Annotations.ipynb @@ -103,14 +103,6 @@ "scrolled": true }, "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "WARNING:rpy2.rinterface_lib.callbacks:R[write to console]: Ensembl site unresponsive, trying useast mirror\n", - "\n" - ] - }, { "name": "stdout", "output_type": "stream", @@ -217,7 +209,7 @@ { "data": { "text/plain": [ - "{'genes_biodomains': ('syn44151254.1', 'csv'),\n", + "{'genes_biodomains': ('syn44151254.4', 'csv'),\n", " 'neuropath_regression_results': ('syn22017882.5', 'csv'),\n", " 'proteomics': ('syn18689335.3', 'csv'),\n", " 'proteomics_tmt': ('syn35221005.2', 'csv'),\n", @@ -281,22 +273,6 @@ "scrolled": true }, "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\n", - "UPGRADE AVAILABLE\n", - "\n", - "A more recent version of the Synapse Client (4.0.0) is available. Your version (3.1.1) can be upgraded by typing:\n", - " pip install --upgrade synapseclient\n", - "\n", - "Python Synapse Client version 4.0.0 release notes\n", - "\n", - "https://python-docs.synapse.org/news/\n", - "\n" - ] - }, { "name": "stdout", "output_type": "stream", @@ -317,7 +293,6 @@ "name": "stdout", "output_type": "stream", "text": [ - "genes_biodomains has an NaN Ensembl ID\n", "target_exp_validation_harmonized has an n/A Ensembl ID\n", "WARNING: no Ensembl ID column found for team_info!\n", "WARNING: no Ensembl ID column found for team_member_info!\n" @@ -325,7 +300,8 @@ } ], "source": [ - "syn = utils._login_to_synapse(token=None) # Assumes you have already logged in with a valid token\n", + "# Assumes you have already logged in with a valid token\n", + "syn = utils._login_to_synapse(token=None)\n", "\n", "# The various column names used to store Ensembl IDs in the files\n", "col_names = [\"ENSG\", \"ensembl_gene_id\", \"GeneID\", \"ensembl_id\"]\n", @@ -346,6 +322,14 @@ " df[[\"geneA_ensembl_gene_id\", \"geneB_ensembl_gene_id\"]]\n", " )[\"value\"]\n", "\n", + " # genes_biodomains is a special case -- the ensembl_id field has some semicolon-separated lists in it\n", + " if file == \"genes_biodomains\":\n", + " df = df[[\"Biodomain\", \"ensembl_id\"]].drop_duplicates().dropna()\n", + " df = utils.split_delimited_field_to_multiple_rows(\n", + " df=df, split_field=\"ensembl_id\", delim=\";\"\n", + " )\n", + " file_ensembl_ids = df[\"ensembl_id\"].drop_duplicates()\n", + "\n", " if file_ensembl_ids is not None:\n", " file_ensembl_list = file_ensembl_list + file_ensembl_ids.tolist()\n", " if \"n/A\" in file_ensembl_ids.tolist():\n", @@ -366,8 +350,8 @@ "name": "stdout", "output_type": "stream", "text": [ - "3627 genes from the data files are missing from Biomart results and will be added.\n", - "66814\n" + "1821 genes from the data files are missing from Biomart results and will be added.\n", + "65009\n" ] } ], @@ -549,9 +533,7 @@ "INFO:biothings.client:done.\n", "INFO:biothings.client:querying 64001-65000...\n", "INFO:biothings.client:done.\n", - "INFO:biothings.client:querying 65001-66000...\n", - "INFO:biothings.client:done.\n", - "INFO:biothings.client:querying 66001-66814...\n", + "INFO:biothings.client:querying 65001-65009...\n", "INFO:biothings.client:done.\n" ] }, @@ -705,8 +687,8 @@ "name": "stdout", "output_type": "stream", "text": [ - "Annotations found for 66166 genes.\n", - "No annotations found for 1230 genes.\n" + "Annotations found for 63844 genes.\n", + "No annotations found for 1176 genes.\n" ] } ], @@ -743,7 +725,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "(67398, 11)\n" + "(65022, 11)\n" ] }, { @@ -927,7 +909,6 @@ " lambda cell: cell if isinstance(cell, list) else [cell]\n", ")\n", "\n", - "\n", "# Some alias values are lists of lists or have duplicate values\n", "def flatten(row):\n", " flattened = []\n", @@ -1068,92 +1049,245 @@ " NaN\n", " \n", " \n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", + " 5133\n", + " ENSG00000230417\n", + " 10\n", + " LINC00595\n", + " 414243\n", + " 1.0\n", + " long intergenic non-protein coding RNA 595\n", + " LINC00595\n", + " ncRNA\n", + " [C10orf101]\n", + " NaN\n", + " NaN\n", " \n", " \n", - " 67263\n", - " ENSG00000284262\n", + " 8675\n", + " ENSG00000188660\n", + " 21\n", + " LINC00319\n", + " 124900467\n", + " 1.0\n", + " uncharacterized LOC124900467\n", + " LOC124900467\n", + " protein-coding\n", + " []\n", + " NaN\n", + " NaN\n", + " \n", + " \n", + " 8676\n", + " ENSG00000188660\n", + " 21\n", + " LINC00319\n", + " 102724398\n", + " 1.0\n", + " uncharacterized CH507-42P11.6\n", + " CH507-42P11.6\n", + " ncRNA\n", + " []\n", + " NaN\n", + " NaN\n", + " \n", + " \n", + " 12016\n", + " ENSG00000278903\n", + " 21\n", " \n", + " 124905527\n", + " 1.0\n", + " uncharacterized LOC124905527\n", + " LOC124905527\n", + " ncRNA\n", + " []\n", " NaN\n", - " 124906462\n", - " 2.0\n", - " double homeobox protein 4\n", - " LOC124906462\n", + " NaN\n", + " \n", + " \n", + " 12017\n", + " ENSG00000278903\n", + " 21\n", + " \n", + " 124905312\n", + " 1.0\n", + " uncharacterized LOC124905312\n", + " LOC124905312\n", + " ncRNA\n", + " []\n", + " NaN\n", + " NaN\n", + " \n", + " \n", + " 12018\n", + " ENSG00000278903\n", + " 21\n", + " \n", + " 124905468\n", + " 1.0\n", + " uncharacterized LOC124905468\n", + " LOC124905468\n", + " ncRNA\n", + " []\n", + " NaN\n", + " NaN\n", + " \n", + " \n", + " 43337\n", + " ENSG00000249738\n", + " 5\n", + " \n", + " 105377683\n", + " 1.0\n", + " uncharacterized LOC105377683\n", + " LOC105377683\n", + " ncRNA\n", + " []\n", + " NaN\n", + " NaN\n", + " \n", + " \n", + " 43338\n", + " ENSG00000249738\n", + " 5\n", + " \n", + " 285626\n", + " 1.0\n", + " uncharacterized LOC285626\n", + " LOC285626\n", + " ncRNA\n", + " []\n", + " NaN\n", + " NaN\n", + " \n", + " \n", + " 43519\n", + " ENSG00000230373\n", + " 15\n", + " GOLGA6L5P\n", + " 100133220\n", + " 1.0\n", + " golgin A6 family like 3, pseudogene\n", + " GOLGA6L3P\n", + " pseudo\n", + " [GOLGA6L3]\n", + " NaN\n", + " NaN\n", + " \n", + " \n", + " 43520\n", + " ENSG00000230373\n", + " 15\n", + " GOLGA6L5P\n", + " 642402\n", + " 1.0\n", + " golgin A6 family like 17, pseudogene\n", + " GOLGA6L17P\n", " pseudo\n", + " [GOLGA6L21P]\n", + " NaN\n", + " NaN\n", + " \n", + " \n", + " 63071\n", + " ENSG00000293331\n", + " 1\n", + " \n", + " 101928626\n", + " 2.0\n", + " uncharacterized LOC101928626\n", + " LOC101928626\n", + " ncRNA\n", " []\n", " NaN\n", " NaN\n", " \n", " \n", - " 67264\n", - " ENSG00000284262\n", + " 63072\n", + " ENSG00000293331\n", + " 1\n", " \n", + " 124901156\n", + " 2.0\n", + " uncharacterized LOC124901156\n", + " LOC124901156\n", + " ncRNA\n", + " []\n", " NaN\n", - " 124906463\n", + " NaN\n", + " \n", + " \n", + " 63715\n", + " ENSG00000276518\n", + " \n", + " NaN\n", + " 128966722\n", " 2.0\n", - " double homeobox protein 4\n", - " LOC124906463\n", - " pseudo\n", + " putative killer cell immunoglobulin-like recep...\n", + " LOC128966722\n", + " protein-coding\n", " []\n", " NaN\n", " NaN\n", " \n", " \n", - " 67265\n", - " ENSG00000284262\n", + " 63716\n", + " ENSG00000276518\n", " \n", " NaN\n", - " 124906464\n", + " 128966730\n", " 2.0\n", - " double homeobox protein 4\n", - " LOC124906464\n", - " pseudo\n", + " putative killer cell immunoglobulin-like recep...\n", + " LOC128966730\n", + " protein-coding\n", " []\n", " NaN\n", " NaN\n", " \n", " \n", - " 67266\n", - " ENSG00000284262\n", + " 63717\n", + " ENSG00000276518\n", " \n", " NaN\n", - " 124906465\n", + " 128966732\n", " 2.0\n", - " double homeobox protein 4\n", - " LOC124906465\n", - " pseudo\n", + " putative killer cell immunoglobulin-like recep...\n", + " LOC128966732\n", + " protein-coding\n", " []\n", " NaN\n", " NaN\n", " \n", " \n", - " 67267\n", - " ENSG00000284262\n", + " 63718\n", + " ENSG00000276518\n", " \n", " NaN\n", - " 124906466\n", + " 128966731\n", " 2.0\n", - " double homeobox protein 4\n", - " LOC124906466\n", - " pseudo\n", + " putative killer cell immunoglobulin-like recep...\n", + " LOC128966731\n", + " protein-coding\n", + " []\n", + " NaN\n", + " NaN\n", + " \n", + " \n", + " 63719\n", + " ENSG00000276518\n", + " \n", + " NaN\n", + " 128966733\n", + " 2.0\n", + " putative killer cell immunoglobulin-like recep...\n", + " LOC128966733\n", + " protein-coding\n", " []\n", " NaN\n", " NaN\n", " \n", " \n", "\n", - "

614 rows × 11 columns

\n", "" ], "text/plain": [ @@ -1163,40 +1297,71 @@ "5130 ENSG00000230417 10 LINC00856 414243 1.0 \n", "5131 ENSG00000230417 10 LINC00856 414243 1.0 \n", "5132 ENSG00000230417 10 LINC00595 414243 1.0 \n", - "... ... ... ... ... ... \n", - "67263 ENSG00000284262 NaN 124906462 2.0 \n", - "67264 ENSG00000284262 NaN 124906463 2.0 \n", - "67265 ENSG00000284262 NaN 124906464 2.0 \n", - "67266 ENSG00000284262 NaN 124906465 2.0 \n", - "67267 ENSG00000284262 NaN 124906466 2.0 \n", - "\n", - " name symbol type_of_gene \\\n", - "4089 uncharacterized LOC101927042 LOC101927042 ncRNA \n", - "4090 uncharacterized LOC124902157 LOC124902157 ncRNA \n", - "5130 long intergenic non-protein coding RNA 595 LINC00595 ncRNA \n", - "5131 long intergenic non-protein coding RNA 595 LINC00595 ncRNA \n", - "5132 long intergenic non-protein coding RNA 595 LINC00595 ncRNA \n", - "... ... ... ... \n", - "67263 double homeobox protein 4 LOC124906462 pseudo \n", - "67264 double homeobox protein 4 LOC124906463 pseudo \n", - "67265 double homeobox protein 4 LOC124906464 pseudo \n", - "67266 double homeobox protein 4 LOC124906465 pseudo \n", - "67267 double homeobox protein 4 LOC124906466 pseudo \n", + "5133 ENSG00000230417 10 LINC00595 414243 1.0 \n", + "8675 ENSG00000188660 21 LINC00319 124900467 1.0 \n", + "8676 ENSG00000188660 21 LINC00319 102724398 1.0 \n", + "12016 ENSG00000278903 21 124905527 1.0 \n", + "12017 ENSG00000278903 21 124905312 1.0 \n", + "12018 ENSG00000278903 21 124905468 1.0 \n", + "43337 ENSG00000249738 5 105377683 1.0 \n", + "43338 ENSG00000249738 5 285626 1.0 \n", + "43519 ENSG00000230373 15 GOLGA6L5P 100133220 1.0 \n", + "43520 ENSG00000230373 15 GOLGA6L5P 642402 1.0 \n", + "63071 ENSG00000293331 1 101928626 2.0 \n", + "63072 ENSG00000293331 1 124901156 2.0 \n", + "63715 ENSG00000276518 NaN 128966722 2.0 \n", + "63716 ENSG00000276518 NaN 128966730 2.0 \n", + "63717 ENSG00000276518 NaN 128966732 2.0 \n", + "63718 ENSG00000276518 NaN 128966731 2.0 \n", + "63719 ENSG00000276518 NaN 128966733 2.0 \n", "\n", - " alias summary notfound \n", - "4089 [] NaN NaN \n", - "4090 [] NaN NaN \n", - "5130 [C10orf101] NaN NaN \n", - "5131 [C10orf101] NaN NaN \n", - "5132 [C10orf101] NaN NaN \n", - "... ... ... ... \n", - "67263 [] NaN NaN \n", - "67264 [] NaN NaN \n", - "67265 [] NaN NaN \n", - "67266 [] NaN NaN \n", - "67267 [] NaN NaN \n", + " name symbol \\\n", + "4089 uncharacterized LOC101927042 LOC101927042 \n", + "4090 uncharacterized LOC124902157 LOC124902157 \n", + "5130 long intergenic non-protein coding RNA 595 LINC00595 \n", + "5131 long intergenic non-protein coding RNA 595 LINC00595 \n", + "5132 long intergenic non-protein coding RNA 595 LINC00595 \n", + "5133 long intergenic non-protein coding RNA 595 LINC00595 \n", + "8675 uncharacterized LOC124900467 LOC124900467 \n", + "8676 uncharacterized CH507-42P11.6 CH507-42P11.6 \n", + "12016 uncharacterized LOC124905527 LOC124905527 \n", + "12017 uncharacterized LOC124905312 LOC124905312 \n", + "12018 uncharacterized LOC124905468 LOC124905468 \n", + "43337 uncharacterized LOC105377683 LOC105377683 \n", + "43338 uncharacterized LOC285626 LOC285626 \n", + "43519 golgin A6 family like 3, pseudogene GOLGA6L3P \n", + "43520 golgin A6 family like 17, pseudogene GOLGA6L17P \n", + "63071 uncharacterized LOC101928626 LOC101928626 \n", + "63072 uncharacterized LOC124901156 LOC124901156 \n", + "63715 putative killer cell immunoglobulin-like recep... LOC128966722 \n", + "63716 putative killer cell immunoglobulin-like recep... LOC128966730 \n", + "63717 putative killer cell immunoglobulin-like recep... LOC128966732 \n", + "63718 putative killer cell immunoglobulin-like recep... LOC128966731 \n", + "63719 putative killer cell immunoglobulin-like recep... LOC128966733 \n", "\n", - "[614 rows x 11 columns]" + " type_of_gene alias summary notfound \n", + "4089 ncRNA [] NaN NaN \n", + "4090 ncRNA [] NaN NaN \n", + "5130 ncRNA [C10orf101] NaN NaN \n", + "5131 ncRNA [C10orf101] NaN NaN \n", + "5132 ncRNA [C10orf101] NaN NaN \n", + "5133 ncRNA [C10orf101] NaN NaN \n", + "8675 protein-coding [] NaN NaN \n", + "8676 ncRNA [] NaN NaN \n", + "12016 ncRNA [] NaN NaN \n", + "12017 ncRNA [] NaN NaN \n", + "12018 ncRNA [] NaN NaN \n", + "43337 ncRNA [] NaN NaN \n", + "43338 ncRNA [] NaN NaN \n", + "43519 pseudo [GOLGA6L3] NaN NaN \n", + "43520 pseudo [GOLGA6L21P] NaN NaN \n", + "63071 ncRNA [] NaN NaN \n", + "63072 ncRNA [] NaN NaN \n", + "63715 protein-coding [] NaN NaN \n", + "63716 protein-coding [] NaN NaN \n", + "63717 protein-coding [] NaN NaN \n", + "63718 protein-coding [] NaN NaN \n", + "63719 protein-coding [] NaN NaN " ] }, "execution_count": 12, @@ -1226,7 +1391,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "29 duplicated genes have been processed.\n" + "8 duplicated genes have been processed.\n" ] }, { @@ -1265,49 +1430,35 @@ " \n", " \n", " \n", - " 66803\n", - " ENSG00000284181\n", + " 64998\n", + " ENSG00000277936\n", " \n", " NaN\n", - " 124905410\n", + " 84311\n", " 1.0\n", - " double homeobox protein 4\n", - " LOC124905410\n", + " mitochondrial ribosomal protein L45\n", + " MRPL45\n", " protein-coding\n", - " [LOC124906461, LOC124906459, LOC124906465, LOC...\n", - " NaN\n", + " [MRP-L45, L45mt, Mba1, mL45]\n", + " Mammalian mitochondrial ribosomal proteins are...\n", " NaN\n", " \n", " \n", - " 66804\n", - " ENSG00000284262\n", + " 64999\n", + " ENSG00000277328\n", " \n", " NaN\n", - " 124906452\n", - " 2.0\n", - " double homeobox protein 4\n", - " LOC124906452\n", - " pseudo\n", - " [LOC124906461, LOC124906459, LOC124906465, LOC...\n", " NaN\n", " NaN\n", - " \n", - " \n", - " 66805\n", - " ENSG00000284496\n", - " \n", " NaN\n", - " 124906452\n", - " 2.0\n", - " double homeobox protein 4\n", - " LOC124906452\n", - " pseudo\n", - " [LOC124906461, LOC124906459, LOC124906465, LOC...\n", " NaN\n", " NaN\n", + " []\n", + " NaN\n", + " True\n", " \n", " \n", - " 66806\n", + " 65000\n", " ENSG00000287838\n", " 9\n", " \n", @@ -1321,86 +1472,100 @@ " NaN\n", " \n", " \n", - " 66807\n", - " ENSG00000284383\n", + " 65001\n", + " ENSG00000249738\n", + " 5\n", " \n", - " NaN\n", - " 124906452\n", - " 2.0\n", - " double homeobox protein 4\n", - " LOC124906452\n", - " pseudo\n", - " [LOC124906461, LOC124906459, LOC124906465, LOC...\n", + " 105377683\n", + " 1.0\n", + " uncharacterized LOC105377683\n", + " LOC105377683\n", + " ncRNA\n", + " [LOC285626]\n", " NaN\n", " NaN\n", " \n", " \n", - " 66808\n", - " ENSG00000283767\n", + " 65002\n", + " ENSG00000293331\n", + " 1\n", " \n", - " NaN\n", - " 124906452\n", + " 101928626\n", " 2.0\n", - " double homeobox protein 4\n", - " LOC124906452\n", - " pseudo\n", - " [LOC124906461, LOC124906459, LOC124906465, LOC...\n", + " uncharacterized LOC101928626\n", + " LOC101928626\n", + " ncRNA\n", + " [LOC124901156]\n", " NaN\n", " NaN\n", " \n", " \n", - " 66809\n", - " ENSG00000283884\n", + " 65003\n", + " ENSG00000276518\n", " \n", " NaN\n", - " 124906452\n", + " 128966722\n", " 2.0\n", - " double homeobox protein 4\n", - " LOC124906452\n", - " pseudo\n", - " [LOC124906461, LOC124906459, LOC124906465, LOC...\n", + " putative killer cell immunoglobulin-like recep...\n", + " LOC128966722\n", + " protein-coding\n", + " [LOC128966731, LOC128966733, LOC128966730, LOC...\n", " NaN\n", " NaN\n", " \n", " \n", - " 66810\n", - " ENSG00000277660\n", - " \n", - " NaN\n", - " 124904108\n", + " 65004\n", + " ENSG00000230417\n", + " 10\n", + " LINC00595\n", + " 414243\n", " 1.0\n", - " U6 spliceosomal RNA\n", - " LOC124904108\n", - " snRNA\n", - " [LOC124906683]\n", + " long intergenic non-protein coding RNA 595\n", + " LINC00595\n", + " ncRNA\n", + " [LINC00595, C10orf101]\n", " NaN\n", " NaN\n", " \n", " \n", - " 66811\n", - " ENSG00000283955\n", + " 65005\n", + " ENSG00000278903\n", + " 21\n", " \n", + " 124905527\n", + " 1.0\n", + " uncharacterized LOC124905527\n", + " LOC124905527\n", + " ncRNA\n", + " [LOC124905468, LOC124905312]\n", " NaN\n", - " 124906452\n", - " 2.0\n", - " double homeobox protein 4\n", - " LOC124906452\n", + " NaN\n", + " \n", + " \n", + " 65006\n", + " ENSG00000230373\n", + " 15\n", + " GOLGA6L5P\n", + " 100133220\n", + " 1.0\n", + " golgin A6 family like 3, pseudogene\n", + " GOLGA6L3P\n", " pseudo\n", - " [LOC124906461, LOC124906459, LOC124906465, LOC...\n", + " [GOLGA6L17P, GOLGA6L21P, GOLGA6L3]\n", " NaN\n", " NaN\n", " \n", " \n", - " 66812\n", - " ENSG00000275405\n", - " \n", - " NaN\n", - " 124905321\n", + " 65007\n", + " ENSG00000188660\n", + " 21\n", + " LINC00319\n", + " 124900467\n", " 1.0\n", - " U1 spliceosomal RNA\n", - " LOC124905321\n", - " snRNA\n", - " [LOC124904613, LOC124905809, LOC124905573]\n", + " uncharacterized LOC124900467\n", + " LOC124900467\n", + " protein-coding\n", + " [CH507-42P11.6]\n", " NaN\n", " NaN\n", " \n", @@ -1409,41 +1574,53 @@ "" ], "text/plain": [ - " ensembl_gene_id chromosome_name hgnc_symbol _id _version \\\n", - "66803 ENSG00000284181 NaN 124905410 1.0 \n", - "66804 ENSG00000284262 NaN 124906452 2.0 \n", - "66805 ENSG00000284496 NaN 124906452 2.0 \n", - "66806 ENSG00000287838 9 101927042 1.0 \n", - "66807 ENSG00000284383 NaN 124906452 2.0 \n", - "66808 ENSG00000283767 NaN 124906452 2.0 \n", - "66809 ENSG00000283884 NaN 124906452 2.0 \n", - "66810 ENSG00000277660 NaN 124904108 1.0 \n", - "66811 ENSG00000283955 NaN 124906452 2.0 \n", - "66812 ENSG00000275405 NaN 124905321 1.0 \n", + " ensembl_gene_id chromosome_name hgnc_symbol _id _version \\\n", + "64998 ENSG00000277936 NaN 84311 1.0 \n", + "64999 ENSG00000277328 NaN NaN NaN \n", + "65000 ENSG00000287838 9 101927042 1.0 \n", + "65001 ENSG00000249738 5 105377683 1.0 \n", + "65002 ENSG00000293331 1 101928626 2.0 \n", + "65003 ENSG00000276518 NaN 128966722 2.0 \n", + "65004 ENSG00000230417 10 LINC00595 414243 1.0 \n", + "65005 ENSG00000278903 21 124905527 1.0 \n", + "65006 ENSG00000230373 15 GOLGA6L5P 100133220 1.0 \n", + "65007 ENSG00000188660 21 LINC00319 124900467 1.0 \n", "\n", - " name symbol type_of_gene \\\n", - "66803 double homeobox protein 4 LOC124905410 protein-coding \n", - "66804 double homeobox protein 4 LOC124906452 pseudo \n", - "66805 double homeobox protein 4 LOC124906452 pseudo \n", - "66806 uncharacterized LOC101927042 LOC101927042 ncRNA \n", - "66807 double homeobox protein 4 LOC124906452 pseudo \n", - "66808 double homeobox protein 4 LOC124906452 pseudo \n", - "66809 double homeobox protein 4 LOC124906452 pseudo \n", - "66810 U6 spliceosomal RNA LOC124904108 snRNA \n", - "66811 double homeobox protein 4 LOC124906452 pseudo \n", - "66812 U1 spliceosomal RNA LOC124905321 snRNA \n", + " name symbol \\\n", + "64998 mitochondrial ribosomal protein L45 MRPL45 \n", + "64999 NaN NaN \n", + "65000 uncharacterized LOC101927042 LOC101927042 \n", + "65001 uncharacterized LOC105377683 LOC105377683 \n", + "65002 uncharacterized LOC101928626 LOC101928626 \n", + "65003 putative killer cell immunoglobulin-like recep... LOC128966722 \n", + "65004 long intergenic non-protein coding RNA 595 LINC00595 \n", + "65005 uncharacterized LOC124905527 LOC124905527 \n", + "65006 golgin A6 family like 3, pseudogene GOLGA6L3P \n", + "65007 uncharacterized LOC124900467 LOC124900467 \n", "\n", - " alias summary notfound \n", - "66803 [LOC124906461, LOC124906459, LOC124906465, LOC... NaN NaN \n", - "66804 [LOC124906461, LOC124906459, LOC124906465, LOC... NaN NaN \n", - "66805 [LOC124906461, LOC124906459, LOC124906465, LOC... NaN NaN \n", - "66806 [LOC124902157] NaN NaN \n", - "66807 [LOC124906461, LOC124906459, LOC124906465, LOC... NaN NaN \n", - "66808 [LOC124906461, LOC124906459, LOC124906465, LOC... NaN NaN \n", - "66809 [LOC124906461, LOC124906459, LOC124906465, LOC... NaN NaN \n", - "66810 [LOC124906683] NaN NaN \n", - "66811 [LOC124906461, LOC124906459, LOC124906465, LOC... NaN NaN \n", - "66812 [LOC124904613, LOC124905809, LOC124905573] NaN NaN " + " type_of_gene alias \\\n", + "64998 protein-coding [MRP-L45, L45mt, Mba1, mL45] \n", + "64999 NaN [] \n", + "65000 ncRNA [LOC124902157] \n", + "65001 ncRNA [LOC285626] \n", + "65002 ncRNA [LOC124901156] \n", + "65003 protein-coding [LOC128966731, LOC128966733, LOC128966730, LOC... \n", + "65004 ncRNA [LINC00595, C10orf101] \n", + "65005 ncRNA [LOC124905468, LOC124905312] \n", + "65006 pseudo [GOLGA6L17P, GOLGA6L21P, GOLGA6L3] \n", + "65007 protein-coding [CH507-42P11.6] \n", + "\n", + " summary notfound \n", + "64998 Mammalian mitochondrial ribosomal proteins are... NaN \n", + "64999 NaN True \n", + "65000 NaN NaN \n", + "65001 NaN NaN \n", + "65002 NaN NaN \n", + "65003 NaN NaN \n", + "65004 NaN NaN \n", + "65005 NaN NaN \n", + "65006 NaN NaN \n", + "65007 NaN NaN " ] }, "execution_count": 13, @@ -1453,7 +1630,7 @@ ], "source": [ "non_dupes = set(gene_table_merged.index) - set(all_duplicated.index)\n", - "keep_df = gene_table_merged.loc[non_dupes].copy(deep=True)\n", + "keep_df = gene_table_merged.loc[list(non_dupes)].copy(deep=True)\n", "\n", "# For each duplicated Ensembl ID, collapse to 1 row and append that row to keep_df\n", "for ens_id in set(all_duplicated[\"ensembl_gene_id\"]):\n", @@ -1483,7 +1660,7 @@ " group.at[group.index[0], \"alias\"] = list(set(group.at[group.index[0], \"alias\"]))\n", "\n", " # Keep the first row only, which now has all the aliases\n", - " keep_df = keep_df.append(group.iloc[0], ignore_index=True)\n", + " keep_df = pd.concat([keep_df, group.iloc[0].to_frame().T], ignore_index=True)\n", "\n", "print(\n", " str(len(all_duplicated.drop_duplicates(\"ensembl_gene_id\")))\n", @@ -1596,7 +1773,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "66813\n", + "65008\n", "Querying genes 1 - 1000\n", "Querying genes 1001 - 2000\n", "Querying genes 2001 - 3000\n", @@ -1662,9 +1839,8 @@ "Querying genes 62001 - 63000\n", "Querying genes 63001 - 64000\n", "Querying genes 64001 - 65000\n", - "Querying genes 65001 - 66000\n", - "Querying genes 66001 - 66813\n", - "66806\n" + "Querying genes 65001 - 65008\n", + "65008\n" ] }, { @@ -1688,96 +1864,96 @@ " \n", " \n", " \n", - " type\n", - " release\n", - " peptide\n", - " version\n", " assembly\n", + " peptide\n", + " release\n", " latest\n", + " possible_replacement\n", + " version\n", " id\n", + " type\n", " is_current\n", - " possible_replacement\n", " \n", " \n", " \n", " \n", - " 66801\n", - " Gene\n", - " 111\n", - " None\n", - " 2\n", + " 65003\n", " GRCh38\n", - " ENSG00000283767.2\n", - " ENSG00000283767\n", - " 1\n", + " None\n", + " 111\n", + " ENSG00000276518.1\n", " []\n", - " \n", - " \n", - " 66802\n", + " 1\n", + " ENSG00000276518\n", " Gene\n", - " 111\n", - " None\n", - " 2\n", - " GRCh38\n", - " ENSG00000283884.2\n", - " ENSG00000283884\n", " 1\n", - " []\n", " \n", " \n", - " 66803\n", - " Gene\n", - " 111\n", - " None\n", - " 1\n", + " 65004\n", " GRCh38\n", - " ENSG00000277660.1\n", - " ENSG00000277660\n", - " 1\n", + " None\n", + " 111\n", + " ENSG00000230417.12\n", " []\n", + " 12\n", + " ENSG00000230417\n", + " Gene\n", + " 1\n", " \n", " \n", - " 66804\n", - " Gene\n", - " 111\n", - " None\n", - " 2\n", + " 65005\n", " GRCh38\n", - " ENSG00000283955.2\n", - " ENSG00000283955\n", - " 1\n", + " None\n", + " 111\n", + " ENSG00000278903.5\n", " []\n", + " 5\n", + " ENSG00000278903\n", + " Gene\n", + " 1\n", " \n", " \n", - " 66805\n", - " Gene\n", - " 111\n", + " 65006\n", + " GRCh38\n", " None\n", + " 111\n", + " ENSG00000230373.9\n", + " []\n", + " 9\n", + " ENSG00000230373\n", + " Gene\n", " 1\n", + " \n", + " \n", + " 65007\n", " GRCh38\n", - " ENSG00000275405.1\n", - " ENSG00000275405\n", - " 1\n", + " None\n", + " 111\n", + " ENSG00000188660.5\n", " []\n", + " 5\n", + " ENSG00000188660\n", + " Gene\n", + " 1\n", " \n", " \n", "\n", "" ], "text/plain": [ - " type release peptide version assembly latest \\\n", - "66801 Gene 111 None 2 GRCh38 ENSG00000283767.2 \n", - "66802 Gene 111 None 2 GRCh38 ENSG00000283884.2 \n", - "66803 Gene 111 None 1 GRCh38 ENSG00000277660.1 \n", - "66804 Gene 111 None 2 GRCh38 ENSG00000283955.2 \n", - "66805 Gene 111 None 1 GRCh38 ENSG00000275405.1 \n", + " assembly peptide release latest possible_replacement \\\n", + "65003 GRCh38 None 111 ENSG00000276518.1 [] \n", + "65004 GRCh38 None 111 ENSG00000230417.12 [] \n", + "65005 GRCh38 None 111 ENSG00000278903.5 [] \n", + "65006 GRCh38 None 111 ENSG00000230373.9 [] \n", + "65007 GRCh38 None 111 ENSG00000188660.5 [] \n", "\n", - " id is_current possible_replacement \n", - "66801 ENSG00000283767 1 [] \n", - "66802 ENSG00000283884 1 [] \n", - "66803 ENSG00000277660 1 [] \n", - "66804 ENSG00000283955 1 [] \n", - "66805 ENSG00000275405 1 [] " + " version id type is_current \n", + "65003 1 ENSG00000276518 Gene 1 \n", + "65004 12 ENSG00000230417 Gene 1 \n", + "65005 5 ENSG00000278903 Gene 1 \n", + "65006 9 ENSG00000230373 Gene 1 \n", + "65007 5 ENSG00000188660 Gene 1 " ] }, "execution_count": 15, @@ -1847,31 +2023,29 @@ "release\n", "100 22\n", "101 8\n", - "102 24\n", + "102 16\n", "103 15\n", "104 19\n", "105 9\n", "106 34\n", "107 10\n", "108 4\n", - "109 5\n", + "109 4\n", "110 11\n", - "111 65595\n", + "111 63843\n", "80 21\n", "81 2\n", "82 10\n", "84 673\n", - "85 1\n", "87 61\n", "89 20\n", "91 75\n", - "92 3\n", "93 53\n", "95 33\n", - "96 36\n", + "96 31\n", "97 18\n", "98 9\n", - "99 35\n", + "99 7\n", "dtype: int64" ] }, @@ -1896,16 +2070,14 @@ "name": "stdout", "output_type": "stream", "text": [ - "66806\n", - "66813\n", - "False\n" + "65008\n", + "65008\n", + "True\n" ] } ], "source": [ "# Check that all IDs are the same between the result and the gene table\n", - "# NOTE: After addition of biodomains, this is no longer true. There are 7 genes from the biodomains\n", - "# dataset that are not in the Ensembl database or archives. \n", "print(len(versions[\"id\"]))\n", "print(len(gene_table_merged))\n", "print(\n", @@ -1948,7 +2120,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 19, "id": "0d5b5652", "metadata": { "scrolled": true @@ -1980,24 +2152,24 @@ "data": { "text/plain": [ "closest_release\n", - "80 919\n", + "80 915\n", "95 33\n", - "96 36\n", + "96 31\n", "97 18\n", "98 9\n", - "99 35\n", + "99 7\n", "100 22\n", "101 8\n", - "102 24\n", + "102 16\n", "103 15\n", "104 19\n", "105 9\n", "106 34\n", "107 10\n", "108 4\n", - "109 5\n", + "109 4\n", "110 11\n", - "111 65595\n", + "111 63843\n", "dtype: int64" ] }, @@ -2049,15 +2221,15 @@ " \n", " \n", " \n", - " type\n", - " release\n", - " peptide\n", - " version\n", " assembly\n", + " peptide\n", + " release\n", " latest\n", + " possible_replacement\n", + " version\n", " id\n", + " type\n", " is_current\n", - " possible_replacement\n", " closest_release\n", " permalink\n", " \n", @@ -2065,71 +2237,71 @@ " \n", " \n", " 0\n", - " Gene\n", - " 111\n", - " None\n", - " 1\n", " GRCh38\n", + " None\n", + " 111\n", " ENSG00000210049.1\n", + " []\n", + " 1\n", " ENSG00000210049\n", + " Gene\n", " 1\n", - " []\n", " 111\n", " https://jan2024.archive.ensembl.org/Homo_sapie...\n", " \n", " \n", " 1\n", - " Gene\n", - " 111\n", - " None\n", - " 2\n", " GRCh38\n", + " None\n", + " 111\n", " ENSG00000211459.2\n", + " []\n", + " 2\n", " ENSG00000211459\n", + " Gene\n", " 1\n", - " []\n", " 111\n", " https://jan2024.archive.ensembl.org/Homo_sapie...\n", " \n", " \n", " 2\n", - " Gene\n", - " 111\n", - " None\n", - " 1\n", " GRCh38\n", + " None\n", + " 111\n", " ENSG00000210077.1\n", + " []\n", + " 1\n", " ENSG00000210077\n", + " Gene\n", " 1\n", - " []\n", " 111\n", " https://jan2024.archive.ensembl.org/Homo_sapie...\n", " \n", " \n", " 3\n", - " Gene\n", - " 111\n", - " None\n", - " 2\n", " GRCh38\n", + " None\n", + " 111\n", " ENSG00000210082.2\n", + " []\n", + " 2\n", " ENSG00000210082\n", + " Gene\n", " 1\n", - " []\n", " 111\n", " https://jan2024.archive.ensembl.org/Homo_sapie...\n", " \n", " \n", " 4\n", - " Gene\n", - " 111\n", - " None\n", - " 1\n", " GRCh38\n", + " None\n", + " 111\n", " ENSG00000209082.1\n", + " []\n", + " 1\n", " ENSG00000209082\n", + " Gene\n", " 1\n", - " []\n", " 111\n", " https://jan2024.archive.ensembl.org/Homo_sapie...\n", " \n", @@ -2138,19 +2310,19 @@ "" ], "text/plain": [ - " type release peptide version assembly latest id \\\n", - "0 Gene 111 None 1 GRCh38 ENSG00000210049.1 ENSG00000210049 \n", - "1 Gene 111 None 2 GRCh38 ENSG00000211459.2 ENSG00000211459 \n", - "2 Gene 111 None 1 GRCh38 ENSG00000210077.1 ENSG00000210077 \n", - "3 Gene 111 None 2 GRCh38 ENSG00000210082.2 ENSG00000210082 \n", - "4 Gene 111 None 1 GRCh38 ENSG00000209082.1 ENSG00000209082 \n", + " assembly peptide release latest possible_replacement version \\\n", + "0 GRCh38 None 111 ENSG00000210049.1 [] 1 \n", + "1 GRCh38 None 111 ENSG00000211459.2 [] 2 \n", + "2 GRCh38 None 111 ENSG00000210077.1 [] 1 \n", + "3 GRCh38 None 111 ENSG00000210082.2 [] 2 \n", + "4 GRCh38 None 111 ENSG00000209082.1 [] 1 \n", "\n", - " is_current possible_replacement closest_release \\\n", - "0 1 [] 111 \n", - "1 1 [] 111 \n", - "2 1 [] 111 \n", - "3 1 [] 111 \n", - "4 1 [] 111 \n", + " id type is_current closest_release \\\n", + "0 ENSG00000210049 Gene 1 111 \n", + "1 ENSG00000211459 Gene 1 111 \n", + "2 ENSG00000210077 Gene 1 111 \n", + "3 ENSG00000210082 Gene 1 111 \n", + "4 ENSG00000209082 Gene 1 111 \n", "\n", " permalink \n", "0 https://jan2024.archive.ensembl.org/Homo_sapie... \n", @@ -2206,87 +2378,87 @@ " \n", " \n", " \n", - " type\n", - " release\n", - " peptide\n", - " version\n", " assembly\n", + " peptide\n", + " release\n", " latest\n", + " possible_replacement\n", + " version\n", " id\n", + " type\n", " is_current\n", - " possible_replacement\n", " closest_release\n", " permalink\n", " \n", " \n", " \n", " \n", - " 63184\n", - " Gene\n", - " 93\n", + " 63180\n", + " GRCh38\n", " None\n", + " 84\n", + " ENSG00000238909.1\n", + " []\n", " 1\n", - " GRCh38\n", - " ENSG00000260977.1\n", - " ENSG00000260977\n", + " ENSG00000238909\n", + " Gene\n", " \n", - " []\n", " 80\n", " https://may2015.archive.ensembl.org/Homo_sapie...\n", " \n", " \n", - " 63187\n", - " Gene\n", - " 84\n", + " 63181\n", + " GRCh38\n", " None\n", + " 84\n", + " ENSG00000265155.1\n", + " []\n", " 1\n", - " GRCh38\n", - " ENSG00000280788.1\n", - " ENSG00000280788\n", + " ENSG00000265155\n", + " Gene\n", " \n", - " []\n", " 80\n", " https://may2015.archive.ensembl.org/Homo_sapie...\n", " \n", " \n", - " 63190\n", - " Gene\n", - " 84\n", - " None\n", - " 2\n", + " 63183\n", " GRCh38\n", - " ENSG00000222197.2\n", - " ENSG00000222197\n", - " \n", + " None\n", + " 84\n", + " ENSG00000275447.1\n", " []\n", + " 1\n", + " ENSG00000275447\n", + " Gene\n", + " \n", " 80\n", " https://may2015.archive.ensembl.org/Homo_sapie...\n", " \n", " \n", - " 63192\n", - " Gene\n", - " 84\n", - " None\n", - " 2\n", + " 63184\n", " GRCh38\n", - " ENSG00000265212.2\n", - " ENSG00000265212\n", - " \n", + " None\n", + " 84\n", + " ENSG00000263623.1\n", " []\n", + " 1\n", + " ENSG00000263623\n", + " Gene\n", + " \n", " 80\n", " https://may2015.archive.ensembl.org/Homo_sapie...\n", " \n", " \n", - " 63200\n", - " Gene\n", - " 87\n", + " 63190\n", + " GRCh38\n", " None\n", + " 84\n", + " ENSG00000238644.1\n", + " []\n", " 1\n", - " GRCh38\n", - " ENSG00000279921.1\n", - " ENSG00000279921\n", + " ENSG00000238644\n", + " Gene\n", " \n", - " []\n", " 80\n", " https://may2015.archive.ensembl.org/Homo_sapie...\n", " \n", @@ -2295,26 +2467,26 @@ "" ], "text/plain": [ - " type release peptide version assembly latest \\\n", - "63184 Gene 93 None 1 GRCh38 ENSG00000260977.1 \n", - "63187 Gene 84 None 1 GRCh38 ENSG00000280788.1 \n", - "63190 Gene 84 None 2 GRCh38 ENSG00000222197.2 \n", - "63192 Gene 84 None 2 GRCh38 ENSG00000265212.2 \n", - "63200 Gene 87 None 1 GRCh38 ENSG00000279921.1 \n", + " assembly peptide release latest possible_replacement \\\n", + "63180 GRCh38 None 84 ENSG00000238909.1 [] \n", + "63181 GRCh38 None 84 ENSG00000265155.1 [] \n", + "63183 GRCh38 None 84 ENSG00000275447.1 [] \n", + "63184 GRCh38 None 84 ENSG00000263623.1 [] \n", + "63190 GRCh38 None 84 ENSG00000238644.1 [] \n", "\n", - " id is_current possible_replacement closest_release \\\n", - "63184 ENSG00000260977 [] 80 \n", - "63187 ENSG00000280788 [] 80 \n", - "63190 ENSG00000222197 [] 80 \n", - "63192 ENSG00000265212 [] 80 \n", - "63200 ENSG00000279921 [] 80 \n", + " version id type is_current closest_release \\\n", + "63180 1 ENSG00000238909 Gene 80 \n", + "63181 1 ENSG00000265155 Gene 80 \n", + "63183 1 ENSG00000275447 Gene 80 \n", + "63184 1 ENSG00000263623 Gene 80 \n", + "63190 1 ENSG00000238644 Gene 80 \n", "\n", " permalink \n", + "63180 https://may2015.archive.ensembl.org/Homo_sapie... \n", + "63181 https://may2015.archive.ensembl.org/Homo_sapie... \n", + "63183 https://may2015.archive.ensembl.org/Homo_sapie... \n", "63184 https://may2015.archive.ensembl.org/Homo_sapie... \n", - "63187 https://may2015.archive.ensembl.org/Homo_sapie... \n", - "63190 https://may2015.archive.ensembl.org/Homo_sapie... \n", - "63192 https://may2015.archive.ensembl.org/Homo_sapie... \n", - "63200 https://may2015.archive.ensembl.org/Homo_sapie... " + "63190 https://may2015.archive.ensembl.org/Homo_sapie... " ] }, "execution_count": 22, @@ -2387,7 +2559,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "(66813, 14)\n" + "(65008, 14)\n" ] }, { @@ -2518,12 +2690,12 @@ "" ], "text/plain": [ - " ensembl_gene_id chromosome_name hgnc_symbol _id _version name \\\n", - "0 ENSG00000210049 MT MT-TF 4558 2.0 tRNA-Phe \n", - "1 ENSG00000211459 MT MT-RNR1 4549 2.0 s-rRNA \n", - "2 ENSG00000210077 MT MT-TV 4577 2.0 tRNA-Val \n", - "3 ENSG00000210082 MT MT-RNR2 4550 2.0 l-rRNA \n", - "4 ENSG00000209082 MT MT-TL1 4567 2.0 tRNA-Leu \n", + " ensembl_gene_id chromosome_name hgnc_symbol _id _version name \\\n", + "0 ENSG00000210049 MT MT-TF 4558 2.0 tRNA-Phe \n", + "1 ENSG00000211459 MT MT-RNR1 4549 2.0 s-rRNA \n", + "2 ENSG00000210077 MT MT-TV 4577 2.0 tRNA-Val \n", + "3 ENSG00000210082 MT MT-RNR2 4550 2.0 l-rRNA \n", + "4 ENSG00000209082 MT MT-TL1 4567 2.0 tRNA-Leu \n", "\n", " symbol type_of_gene alias \\\n", "0 TRNF tRNA [] \n", @@ -2695,83 +2867,83 @@ " ...\n", " \n", " \n", - " 66808\n", - " ENSG00000283767\n", - " double homeobox protein 4\n", - " [LOC124906461, LOC124906459, LOC124906465, LOC...\n", + " 65003\n", + " ENSG00000276518\n", + " putative killer cell immunoglobulin-like recep...\n", + " [LOC128966731, LOC128966733, LOC128966730, LOC...\n", " NaN\n", - " LOC124906452\n", - " pseudo\n", + " LOC128966722\n", + " protein-coding\n", " 111\n", " []\n", " https://jan2024.archive.ensembl.org/Homo_sapie...\n", " \n", " \n", - " 66809\n", - " ENSG00000283884\n", - " double homeobox protein 4\n", - " [LOC124906461, LOC124906459, LOC124906465, LOC...\n", + " 65004\n", + " ENSG00000230417\n", + " long intergenic non-protein coding RNA 595\n", + " [LINC00595, C10orf101]\n", " NaN\n", - " LOC124906452\n", - " pseudo\n", + " LINC00595\n", + " ncRNA\n", " 111\n", " []\n", " https://jan2024.archive.ensembl.org/Homo_sapie...\n", " \n", " \n", - " 66810\n", - " ENSG00000277660\n", - " U6 spliceosomal RNA\n", - " [LOC124906683]\n", + " 65005\n", + " ENSG00000278903\n", + " uncharacterized LOC124905527\n", + " [LOC124905468, LOC124905312]\n", " NaN\n", - " LOC124904108\n", - " snRNA\n", + " LOC124905527\n", + " ncRNA\n", " 111\n", " []\n", " https://jan2024.archive.ensembl.org/Homo_sapie...\n", " \n", " \n", - " 66811\n", - " ENSG00000283955\n", - " double homeobox protein 4\n", - " [LOC124906461, LOC124906459, LOC124906465, LOC...\n", + " 65006\n", + " ENSG00000230373\n", + " golgin A6 family like 3, pseudogene\n", + " [GOLGA6L17P, GOLGA6L21P, GOLGA6L3]\n", " NaN\n", - " LOC124906452\n", + " GOLGA6L3P\n", " pseudo\n", " 111\n", " []\n", " https://jan2024.archive.ensembl.org/Homo_sapie...\n", " \n", " \n", - " 66812\n", - " ENSG00000275405\n", - " U1 spliceosomal RNA\n", - " [LOC124904613, LOC124905809, LOC124905573]\n", + " 65007\n", + " ENSG00000188660\n", + " uncharacterized LOC124900467\n", + " [CH507-42P11.6]\n", " NaN\n", - " LOC124905321\n", - " snRNA\n", + " LOC124900467\n", + " protein-coding\n", " 111\n", " []\n", " https://jan2024.archive.ensembl.org/Homo_sapie...\n", " \n", " \n", "\n", - "

66813 rows × 9 columns

\n", + "

65008 rows × 9 columns

\n", "" ], "text/plain": [ - " ensembl_gene_id name \\\n", - "0 ENSG00000210049 tRNA-Phe \n", - "1 ENSG00000211459 s-rRNA \n", - "2 ENSG00000210077 tRNA-Val \n", - "3 ENSG00000210082 l-rRNA \n", - "4 ENSG00000209082 tRNA-Leu \n", - "... ... ... \n", - "66808 ENSG00000283767 double homeobox protein 4 \n", - "66809 ENSG00000283884 double homeobox protein 4 \n", - "66810 ENSG00000277660 U6 spliceosomal RNA \n", - "66811 ENSG00000283955 double homeobox protein 4 \n", - "66812 ENSG00000275405 U1 spliceosomal RNA \n", + " ensembl_gene_id name \\\n", + "0 ENSG00000210049 tRNA-Phe \n", + "1 ENSG00000211459 s-rRNA \n", + "2 ENSG00000210077 tRNA-Val \n", + "3 ENSG00000210082 l-rRNA \n", + "4 ENSG00000209082 tRNA-Leu \n", + "... ... ... \n", + "65003 ENSG00000276518 putative killer cell immunoglobulin-like recep... \n", + "65004 ENSG00000230417 long intergenic non-protein coding RNA 595 \n", + "65005 ENSG00000278903 uncharacterized LOC124905527 \n", + "65006 ENSG00000230373 golgin A6 family like 3, pseudogene \n", + "65007 ENSG00000188660 uncharacterized LOC124900467 \n", "\n", " alias \\\n", "0 [] \n", @@ -2780,11 +2952,11 @@ "3 [MTRNR2] \n", "4 [MTTL1] \n", "... ... \n", - "66808 [LOC124906461, LOC124906459, LOC124906465, LOC... \n", - "66809 [LOC124906461, LOC124906459, LOC124906465, LOC... \n", - "66810 [LOC124906683] \n", - "66811 [LOC124906461, LOC124906459, LOC124906465, LOC... \n", - "66812 [LOC124904613, LOC124905809, LOC124905573] \n", + "65003 [LOC128966731, LOC128966733, LOC128966730, LOC... \n", + "65004 [LINC00595, C10orf101] \n", + "65005 [LOC124905468, LOC124905312] \n", + "65006 [GOLGA6L17P, GOLGA6L21P, GOLGA6L3] \n", + "65007 [CH507-42P11.6] \n", "\n", " summary symbol \\\n", "0 NaN TRNF \n", @@ -2793,24 +2965,24 @@ "3 Enables G protein-coupled receptor binding act... RNR2 \n", "4 Implicated in cardiomyopathy. [provided by All... TRNL1 \n", "... ... ... \n", - "66808 NaN LOC124906452 \n", - "66809 NaN LOC124906452 \n", - "66810 NaN LOC124904108 \n", - "66811 NaN LOC124906452 \n", - "66812 NaN LOC124905321 \n", + "65003 NaN LOC128966722 \n", + "65004 NaN LINC00595 \n", + "65005 NaN LOC124905527 \n", + "65006 NaN GOLGA6L3P \n", + "65007 NaN LOC124900467 \n", "\n", - " type_of_gene ensembl_release possible_replacement \\\n", - "0 tRNA 111 [] \n", - "1 rRNA 111 [] \n", - "2 tRNA 111 [] \n", - "3 rRNA 111 [] \n", - "4 tRNA 111 [] \n", - "... ... ... ... \n", - "66808 pseudo 111 [] \n", - "66809 pseudo 111 [] \n", - "66810 snRNA 111 [] \n", - "66811 pseudo 111 [] \n", - "66812 snRNA 111 [] \n", + " type_of_gene ensembl_release possible_replacement \\\n", + "0 tRNA 111 [] \n", + "1 rRNA 111 [] \n", + "2 tRNA 111 [] \n", + "3 rRNA 111 [] \n", + "4 tRNA 111 [] \n", + "... ... ... ... \n", + "65003 protein-coding 111 [] \n", + "65004 ncRNA 111 [] \n", + "65005 ncRNA 111 [] \n", + "65006 pseudo 111 [] \n", + "65007 protein-coding 111 [] \n", "\n", " permalink \n", "0 https://jan2024.archive.ensembl.org/Homo_sapie... \n", @@ -2819,13 +2991,13 @@ "3 https://jan2024.archive.ensembl.org/Homo_sapie... \n", "4 https://jan2024.archive.ensembl.org/Homo_sapie... \n", "... ... \n", - "66808 https://jan2024.archive.ensembl.org/Homo_sapie... \n", - "66809 https://jan2024.archive.ensembl.org/Homo_sapie... \n", - "66810 https://jan2024.archive.ensembl.org/Homo_sapie... \n", - "66811 https://jan2024.archive.ensembl.org/Homo_sapie... \n", - "66812 https://jan2024.archive.ensembl.org/Homo_sapie... \n", + "65003 https://jan2024.archive.ensembl.org/Homo_sapie... \n", + "65004 https://jan2024.archive.ensembl.org/Homo_sapie... \n", + "65005 https://jan2024.archive.ensembl.org/Homo_sapie... \n", + "65006 https://jan2024.archive.ensembl.org/Homo_sapie... \n", + "65007 https://jan2024.archive.ensembl.org/Homo_sapie... \n", "\n", - "[66813 rows x 9 columns]" + "[65008 rows x 9 columns]" ] }, "execution_count": 26, diff --git a/test_config.yaml b/test_config.yaml index 879a6d1d..0341cfca 100644 --- a/test_config.yaml +++ b/test_config.yaml @@ -135,7 +135,7 @@ datasets: - gene_info: files: - name: gene_metadata - id: syn25953363.10 + id: syn25953363.11 format: feather - name: igap id: syn12514826.5 @@ -178,7 +178,7 @@ datasets: possible_replacement: ensembl_possible_replacements permalink: ensembl_permalink provenance: - - syn25953363.10 + - syn25953363.11 - syn12514826.5 - syn12514912.3 - *agora_proteomics_provenance From 699defe432cf4ecbcf98cabf47bd369e0a7aad4e Mon Sep 17 00:00:00 2001 From: Jaclyn Beck Date: Tue, 20 Feb 2024 20:17:29 -0800 Subject: [PATCH 4/7] Moved biodomain split operation before grouping operation in gene_info transform --- src/agoradatatools/etl/transform/gene_info.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/agoradatatools/etl/transform/gene_info.py b/src/agoradatatools/etl/transform/gene_info.py index 62a06d8d..769019e8 100644 --- a/src/agoradatatools/etl/transform/gene_info.py +++ b/src/agoradatatools/etl/transform/gene_info.py @@ -82,6 +82,10 @@ def transform_gene_info( ) biodomains = biodomains.dropna(subset=["biodomain", "ensembl_gene_id"]) + biodomains = split_delimited_field_to_multiple_rows( + df=biodomains, split_field="ensembl_gene_id", delim=";" + ) + biodomains = ( biodomains.groupby("ensembl_gene_id")["biodomain"] .apply(set) # ensure unique biodomain names @@ -90,10 +94,6 @@ def transform_gene_info( .rename(columns={"biodomain": "biodomains"}) ) - biodomains = split_delimited_field_to_multiple_rows( - df=biodomains, split_field="ensembl_gene_id", delim=";" - ) - # sort biodomains list alphabetically biodomains["biodomains"] = biodomains["biodomains"].apply(sorted) From 4d5baef2f9949fe431663397d91063735a42d2c5 Mon Sep 17 00:00:00 2001 From: Jaclyn Beck Date: Wed, 21 Feb 2024 10:30:53 -0800 Subject: [PATCH 5/7] Added clarification to the docstring for the new util function --- src/agoradatatools/etl/utils.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/src/agoradatatools/etl/utils.py b/src/agoradatatools/etl/utils.py index b228ea55..13e57e92 100644 --- a/src/agoradatatools/etl/utils.py +++ b/src/agoradatatools/etl/utils.py @@ -225,10 +225,16 @@ def split_delimited_field_to_multiple_rows( function creates duplicate rows for each item in that list, with identical data in the other columns. Then the target column for these duplicate rows (plus the original row) is assigned a single value from the list, resulting in one row per item in the former list. - An example of where this function is needed: the genes_biodomains dataset has some semicolon-separated Ensembl IDs - in its ensembl_gene_id field, in addition to rows with a single Ensembl ID in the field. For rows with a list of - Ensembl IDs, the field is split on ";" and the function creates duplicate rows for each Ensembl ID in the list. Then - the ensembl_gene_id field for these duplicates is re-assigned so that there is one Ensembl ID per row. + For example, an input data frame that looks like this: + biodomain go_term ensembl_id + Apoptosis GO:123 ENSG0001 + Synapse GO:456 ENSG0002;ENSG0003;ENSG0004 + will come out looking like this: + biodomain go_term ensembl_id + Apoptosis GO:123 ENSG0001 + Synapse GO:456 ENSG0002 + Synapse GO:456 ENSG0003 + Synapse GO:456 ENSG0004 Args: df (pd.DataFrame): the DataFrame containing a column with delimiter-separated strings. The column can contain a From 2327fbc4ecc49feab180adc8a5a1098b4c0ef28a Mon Sep 17 00:00:00 2001 From: Jaclyn Beck Date: Wed, 21 Feb 2024 10:47:49 -0800 Subject: [PATCH 6/7] Added test for null values for the split delimited function --- tests/test_utils.py | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/tests/test_utils.py b/tests/test_utils.py index 9f3dbe25..493e5296 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -434,9 +434,22 @@ def test_split_delimited_field_to_multiple_rows_with_regex_delim(self) -> None: assert output.equals(self.expected_df) def test_split_delimited_field_to_multiple_rows_with_no_split(self) -> None: - input_df = self.expected_df.copy() + input_df = self.expected_df.copy(deep=True) output = utils.split_delimited_field_to_multiple_rows( df=input_df, split_field="col_1", delim=";" ) assert output.equals(self.expected_df) + + def test_split_delimited_field_to_multiple_rows_type_error(self) -> None: + input_df = pd.DataFrame( + { + "col_1": ["a", None], + "col_2": ["x", "y"], + "col_3": ["1", "2"], + } + ) + with pytest.raises(TypeError, match="has no len()"): + utils.split_delimited_field_to_multiple_rows( + df=input_df, split_field="col_1", delim=";" + ) From e6c30d4a11f5c7cb628b9599c4a3179e53872e00 Mon Sep 17 00:00:00 2001 From: Jaclyn Beck Date: Wed, 21 Feb 2024 12:34:47 -0800 Subject: [PATCH 7/7] Changed resource url variables to uppercase to indicate constants --- src/agoradatatools/etl/transform/gene_info.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/agoradatatools/etl/transform/gene_info.py b/src/agoradatatools/etl/transform/gene_info.py index 769019e8..88ab7c4d 100644 --- a/src/agoradatatools/etl/transform/gene_info.py +++ b/src/agoradatatools/etl/transform/gene_info.py @@ -102,16 +102,16 @@ def transform_gene_info( # tep_info file and not the symbol in gene_info, because there are some mismatches # between the two and the hgnc_symbol from tep_info is the correct one to use here. # resource_url should be NA if both is_adi and is_tep are false. - resource_url_prefix = ( + RESOURCE_URL_PREFIX = ( "https://adknowledgeportal.synapse.org/Explore/Target%20Enabling%20Resources?QueryWrapper0=%7B%22sql%22%3A%22" + "select%20*%20from%20syn26146692%20WHERE%20%60isPublic%60%20%3D%20true%22%2C%22limit%22%3A25%2C%22offset%22" + "%3A0%2C%22selectedFacets%22%3A%5B%7B%22concreteType%22%3A%22org.sagebionetworks.repo.model.table." + "FacetColumnValuesRequest%22%2C%22columnName%22%3A%22target%22%2C%22facetValues%22%3A%5B%22" ) - resource_url_suffix = "%22%5D%7D%5D%7D" + RESOURCE_URL_SUFFIX = "%22%5D%7D%5D%7D" tep_info["resource_url"] = tep_info.apply( lambda row: ( - resource_url_prefix + row["hgnc_symbol"] + resource_url_suffix + RESOURCE_URL_PREFIX + row["hgnc_symbol"] + RESOURCE_URL_SUFFIX if row["is_adi"] or row["is_tep"] else np.NaN ),