diff --git a/src/agoradatatools/etl/transform/gene_info.py b/src/agoradatatools/etl/transform/gene_info.py index bc30dcd4..55ede085 100644 --- a/src/agoradatatools/etl/transform/gene_info.py +++ b/src/agoradatatools/etl/transform/gene_info.py @@ -5,8 +5,8 @@ def transform_gene_info( - datasets: dict, adjusted_p_value_threshold, protein_level_threshold -): + datasets: dict, adjusted_p_value_threshold: float, protein_level_threshold: float +) -> pd.DataFrame: """ This function will perform transformations and incrementally create a dataset called gene_info. Each dataset will be left_joined onto gene_info, starting with gene_metadata. @@ -81,6 +81,7 @@ def transform_gene_info( drop_columns=["ensembl_gene_id"], ) + biodomains = biodomains.dropna(subset=["biodomain", "ensembl_gene_id"]) biodomains = ( biodomains.groupby("ensembl_gene_id")["biodomain"] .apply(set) # ensure unique biodomain names @@ -92,34 +93,39 @@ def transform_gene_info( # sort biodomains list alphabetically biodomains["biodomains"] = biodomains["biodomains"].apply(sorted) + # Type-check the 'is_adi' and 'is_tep' columns of tep_info to make sure they are booleans and not strings. + # Explicitly make NaN is_adi and is_tep values "False" to avoid having to check for boolean and NaN in the + # check below. + tep_info = tep_info.fillna({"is_adi": False, "is_tep": False}) + if tep_info["is_adi"].dtype != bool: + raise TypeError( + f"'is_adi' column must be 'bool', current type is {tep_info['is_adi'].dtype}" + ) + if tep_info["is_tep"].dtype != bool: + raise TypeError( + f"'is_tep' column must be 'bool', current type is {tep_info['is_tep'].dtype}" + ) + # For genes with either is_adi or is_tep set to True, create a resource URL that opens # the portal page to the specific gene. This must be done using the hgnc_symbol from the # tep_info file and not the symbol in gene_info, because there are some mismatches # between the two and the hgnc_symbol from tep_info is the correct one to use here. 
# resource_url should be NA if both is_adi and is_tep are false. - resource_url_prefix = "https://adknowledgeportal.synapse.org/Explore/Target%20Enabling%20Resources?QueryWrapper0=%7B%22sql%22%3A%22select%20*%20from%20syn26146692%20WHERE%20%60isPublic%60%20%3D%20true%22%2C%22limit%22%3A25%2C%22offset%22%3A0%2C%22selectedFacets%22%3A%5B%7B%22concreteType%22%3A%22org.sagebionetworks.repo.model.table.FacetColumnValuesRequest%22%2C%22columnName%22%3A%22target%22%2C%22facetValues%22%3A%5B%22" - resource_url_suffix = "%22%5D%7D%5D%7D" - tep_info["resource_url"] = tep_info.apply( - lambda row: resource_url_prefix + row["hgnc_symbol"] + resource_url_suffix - if row["is_adi"] or row["is_tep"] - else np.NaN, - axis=1, + RESOURCE_URL_PREFIX = ( + "https://adknowledgeportal.synapse.org/Explore/Target%20Enabling%20Resources?QueryWrapper0=%7B%22sql%22%3A%22" + + "select%20*%20from%20syn26146692%20WHERE%20%60isPublic%60%20%3D%20true%22%2C%22limit%22%3A25%2C%22offset%22" + + "%3A0%2C%22selectedFacets%22%3A%5B%7B%22concreteType%22%3A%22org.sagebionetworks.repo.model.table." 
+ + "FacetColumnValuesRequest%22%2C%22columnName%22%3A%22target%22%2C%22facetValues%22%3A%5B%22" ) + RESOURCE_URL_SUFFIX = "%22%5D%7D%5D%7D" - ensembl_info = gene_metadata[ - [ - "ensembl_gene_id", - "ensembl_release", - "ensembl_possible_replacements", - "ensembl_permalink", - ] - ] - ensembl_info = nest_fields( - df=ensembl_info, - grouping="ensembl_gene_id", - new_column="ensembl_info", - drop_columns=["ensembl_gene_id"], - nested_field_is_list=False, + tep_info["resource_url"] = tep_info.apply( + lambda row: ( + RESOURCE_URL_PREFIX + row["hgnc_symbol"] + RESOURCE_URL_SUFFIX + if row["is_adi"] is True or row["is_tep"] is True + else np.NaN + ), + axis=1, ) # Merge all the datasets @@ -135,7 +141,6 @@ def transform_gene_info( druggability, biodomains, tep_info, - ensembl_info, ]: gene_info = pd.merge( left=gene_info, @@ -159,12 +164,41 @@ def transform_gene_info( inplace=True, ) - # fillna doesn't work for creating an empty array, need this function instead - gene_info["alias"] = gene_info.apply( - lambda row: row["alias"] - if isinstance(row["alias"], np.ndarray) - else np.ndarray(0, dtype=object), - axis=1, + # fillna doesn't work for creating an empty array, need this function instead for alias and possible replacements + gene_info["alias"] = gene_info["alias"].apply( + lambda row: row if isinstance(row, np.ndarray) else np.ndarray(0, dtype=object) + ) + + gene_info["ensembl_possible_replacements"] = gene_info[ + "ensembl_possible_replacements" + ].apply( + lambda row: row if isinstance(row, np.ndarray) else np.ndarray(0, dtype=object) + ) + + # Add ensembl_info as a nested field. This is done after merging all other data sets so it applies to + # all possible Ensembl IDs in all data sets. 
+ ensembl_info = gene_info[ + [ + "ensembl_gene_id", + "ensembl_release", + "ensembl_possible_replacements", + "ensembl_permalink", + ] + ] + ensembl_info = nest_fields( + df=ensembl_info, + grouping="ensembl_gene_id", + new_column="ensembl_info", + drop_columns=["ensembl_gene_id"], + nested_field_is_list=False, + ) + + gene_info = pd.merge( + left=gene_info, + right=ensembl_info, + on="ensembl_gene_id", + how="outer", + validate="one_to_one", ) gene_info["rna_brain_change_studied"] = gene_info["adj_p_val"] != -1 @@ -179,9 +213,11 @@ def transform_gene_info( # create 'total_nominations' field gene_info["total_nominations"] = gene_info.apply( - lambda row: len(row["target_nominations"]) - if isinstance(row["target_nominations"], list) - else np.NaN, + lambda row: ( + len(row["target_nominations"]) + if isinstance(row["target_nominations"], list) + else np.NaN + ), axis=1, ) diff --git a/src/agoradatatools/etl/transform/genes_biodomains.py b/src/agoradatatools/etl/transform/genes_biodomains.py index 9d278cda..c6361d3d 100644 --- a/src/agoradatatools/etl/transform/genes_biodomains.py +++ b/src/agoradatatools/etl/transform/genes_biodomains.py @@ -103,7 +103,7 @@ def transform_genes_biodomains(datasets: dict) -> pd.DataFrame: df=genes_biodomains, grouping="ensembl_gene_id", new_column="gene_biodomains", - drop_columns="ensembl_gene_id", + drop_columns=["ensembl_gene_id"], ) return genes_biodomains diff --git a/tests/test_assets/gene_info/input/diff_exp_data_good_input.csv b/tests/test_assets/gene_info/input/diff_exp_data_good_input.csv new file mode 100644 index 00000000..74ef3dfa --- /dev/null +++ b/tests/test_assets/gene_info/input/diff_exp_data_good_input.csv @@ -0,0 +1,16 @@ +model,tissue,comparison,ensembl_gene_id,logfc,ci_l,ci_r,aveexpr,t,p_value,adj_p_val,gene_biotype,chromosome_name,direction,hgnc_symbol,percentage_gc_content,gene_length,sex,study 
+Diagnosis,TCX,AD-CONTROL,ENSG00000001626,-0.384,-0.61,-0.157,1.1,-3.32,0.001,0.003,protein_coding,7.0,DOWN,CFTR,36.59,250188,ALL,MAYO +Diagnosis,TCX,AD-CONTROL,ENSG00000001631,0.096,0.028,0.164,3.66,2.79,0.005,0.013,protein_coding,7.0,NONE,KRIT1,36.48,47198,ALL,MAYO +,TCX,AD-CONTROL,ENSG00000001629,0.069,-0.034,0.171,6.84,1.32,0.188,0.267,protein_coding,7.0,NONE,ANKIB1,36.83,155410,ALL,MAYO +Diagnosis,,AD-CONTROL,ENSG00000001460,-0.043,-0.134,0.048,4.53,-0.92,0.357,0.452,protein_coding,1.0,NONE,,44.09,59936,ALL,MAYO +Diagnosis,TCX,,ENSG00000000419,-0.011,-0.073,0.052,4.98,-0.34,0.737,0.799,protein_coding,20.0,NONE,DPM1,39.85,23689,ALL,MAYO +Diagnosis,IFG,AD-CONTROL,ENSG00000000419,-0.088,-0.182,0.007,4.32,-1.82,0.069,0.155,,20.0,NONE,DPM1,39.85,23689,ALL,MSSM +Diagnosis,IFG,,ENSG00000001629,0.08,0.007,0.154,7.93,2.16,0.031,0.085,protein_coding,,NONE,ANKIB1,36.83,155410,ALL,MSSM +Diagnosis,IFG,AD-CONTROL,ENSG00000001460,-0.034,-0.114,0.046,3.81,-0.84,0.4,0.554,protein_coding,1.0,,STPG1,44.09,59936,,MSSM +Diagnosis,IFG,AD-CONTROL,ENSG00000001631,-0.029,-0.095,0.037,4.12,-0.86,0.392,0.547,protein_coding,7.0,NONE,KRIT1,36.48,47198,ALL, +Diagnosis,IFG,AD-CONTROL,ENSG00000001626,0.026,-0.128,0.179,3.01,0.33,0.741,0.834,protein_coding,7.0,NONE,,36.59,250188,ALL,MSSM +Diagnosis,DLPFC,AD-CONTROL,ENSG00000001626,-0.143,-0.343,0.058,-0.3,-1.39,0.163,0.296,protein_coding,7.0,NONE,CFTR,36.59,250188,ALL, +Diagnosis,DLPFC,AD-CONTROL,ENSG00000000419,-0.088,-0.131,-0.045,3.93,-4.03,0.0,0.001,protein_coding,20.0,NONE,DPM1,39.85,23689,ALL,ROSMAP +Diagnosis,DLPFC,AD-CONTROL,ENSG00000001629,0.084,0.035,0.133,6.65,3.39,0.001,0.005,protein_coding,7.0,NONE,ANKIB1,36.83,155410,ALL,ROSMAP +Diagnosis,DLPFC,AD-CONTROL,ENSG00000001460,-0.04,-0.092,0.012,4.02,-1.51,0.131,0.252,protein_coding,1.0,NONE,STPG1,44.09,59936,ALL,ROSMAP +Diagnosis,DLPFC,AD-CONTROL,ENSG00000001631,-0.028,-0.07,0.014,2.94,-1.32,0.187,0.327,protein_coding,7.0,NONE,KRIT1,36.48,47198,ALL,ROSMAP diff --git 
a/tests/test_assets/gene_info/input/diff_exp_data_type_error.csv b/tests/test_assets/gene_info/input/diff_exp_data_type_error.csv new file mode 100644 index 00000000..3a809fab --- /dev/null +++ b/tests/test_assets/gene_info/input/diff_exp_data_type_error.csv @@ -0,0 +1,3 @@ +model,tissue,comparison,ensembl_gene_id,logfc,ci_l,ci_r,aveexpr,t,p_value,adj_p_val,gene_biotype,chromosome_name,direction,hgnc_symbol,percentage_gc_content,gene_length,sex,study +Diagnosis,TCX,AD-CONTROL,ENSG00000001626,-0.384,-0.61,-0.157,1.1,-3.32,0.001,string_value,protein_coding,7.0,DOWN,CFTR,36.59,250188,ALL,MAYO +Diagnosis,TCX,AD-CONTROL,ENSG00000001631,0.096,0.028,0.164,3.66,2.79,0.005,0.013,protein_coding,7.0,NONE,KRIT1,36.48,47198,ALL,MAYO diff --git a/tests/test_assets/gene_info/input/druggability_good_input.csv b/tests/test_assets/gene_info/input/druggability_good_input.csv new file mode 100644 index 00000000..28fe765e --- /dev/null +++ b/tests/test_assets/gene_info/input/druggability_good_input.csv @@ -0,0 +1,6 @@ +ensembl_gene_id,sm_druggability_bucket,safety_bucket,feasibility_bucket,abability_bucket,new_modality_bucket,tissue_engagement_bucket,pharos_class,classification,safety_bucket_definition,feasibility_bucket_definition,abability_bucket_definition,new_modality_bucket_definition,tissue_engagement_bucket_definition +ENSG00000000005,7,4,4,3,4,4,Tbio,,Safety definition 1,Feasibility definition 1,Abability definition 1,Modaility definition 1,Tissue engagement definition 1 +ENSG00000001036,1,3,5,1,4,2,Tchem,Classification 2,,Feasibility definition 2,Abability definition 2,Modaility definition 2,Tissue engagement definition 2 +ENSG00000000460,13,3,5,3,4,4,Tdark,Classification 3,Safety definition 3,,,Modaility definition 3,Tissue engagement definition 3 +ENSG00000000971,3,4,3,1,4,5,Tbio,Classification 4,Safety definition 4,Feasibility definition 4,Abability definition 4,,Tissue engagement definition 4 +ENSG00000001084,1,5,3,3,4,2,,Classification 5,Safety definition 5,Feasibility 
definition 5,Abability definition 5,Modaility definition 5, diff --git a/tests/test_assets/gene_info/input/eqtl_good_input.csv b/tests/test_assets/gene_info/input/eqtl_good_input.csv new file mode 100644 index 00000000..a8b17520 --- /dev/null +++ b/tests/test_assets/gene_info/input/eqtl_good_input.csv @@ -0,0 +1,7 @@ +ensembl_gene_id,is_eqtl +ENSG00000000419,True +ENSG00000000971,True +ENSG00000001460,True +ENSG00000001626,True +ENSG00000161149, +ENSG00000001517,False diff --git a/tests/test_assets/gene_info/input/eqtl_merge_error.csv b/tests/test_assets/gene_info/input/eqtl_merge_error.csv new file mode 100644 index 00000000..38984052 --- /dev/null +++ b/tests/test_assets/gene_info/input/eqtl_merge_error.csv @@ -0,0 +1,4 @@ +ensembl_gene_id,is_eqtl +ENSG00000000419,True +ENSG00000000971,True +ENSG00000000419,True diff --git a/tests/test_assets/gene_info/input/gene_metadata_good_input.feather b/tests/test_assets/gene_info/input/gene_metadata_good_input.feather new file mode 100644 index 00000000..291717bf Binary files /dev/null and b/tests/test_assets/gene_info/input/gene_metadata_good_input.feather differ diff --git a/tests/test_assets/gene_info/input/gene_metadata_merge_error.feather b/tests/test_assets/gene_info/input/gene_metadata_merge_error.feather new file mode 100644 index 00000000..1747493c Binary files /dev/null and b/tests/test_assets/gene_info/input/gene_metadata_merge_error.feather differ diff --git a/tests/test_assets/gene_info/input/genes_biodomains_good_input.csv b/tests/test_assets/gene_info/input/genes_biodomains_good_input.csv new file mode 100644 index 00000000..0c111a10 --- /dev/null +++ b/tests/test_assets/gene_info/input/genes_biodomains_good_input.csv @@ -0,0 +1,7 @@ +biodomain,abbr,label,color,go_id,goterm_name,n_symbol,symbol,ensembl_gene_id +Synapse,Sy,Synapse [Sy],#329a33,GO:0005102,signaling receptor binding,362.0,TNMD,ENSG00000000005 +Proteostasis,Pr,Proteostasis [Pr],#c8b269,,endopeptidase activity,73.0,TNMD,ENSG00000000005 
+Apoptosis,Ap,Apoptosis [Ap],,GO:0006915,apoptotic process,577.0,DPM1,ENSG00000000419 +Structural Stabilization,,,#ff9a9a,GO:0030863,cortical cytoskeleton,21.0,SCYL3,ENSG00000000457 +Synapse,Sy,Synapse [Sy],#329a33,GO:0034704,,,TNMD,ENSG00000000005 +,Sy,Synapse [Sy],#329a33,GO:0034704,,,TNMD,ENSG00000000005 diff --git a/tests/test_assets/gene_info/input/igap_good_input.csv b/tests/test_assets/gene_info/input/igap_good_input.csv new file mode 100644 index 00000000..0093a35a --- /dev/null +++ b/tests/test_assets/gene_info/input/igap_good_input.csv @@ -0,0 +1,7 @@ +ensembl_gene_id,hgnc_symbol +ENSG00000000419,ADAMTS1 +ENSG00000000460,APH1B +ENSG00000000971, +ENSG00000001084,UNC5CL +ENSG00000001460,ICA1 +,ICA1 diff --git a/tests/test_assets/gene_info/input/igap_merge_error.csv b/tests/test_assets/gene_info/input/igap_merge_error.csv new file mode 100644 index 00000000..ed04de6b --- /dev/null +++ b/tests/test_assets/gene_info/input/igap_merge_error.csv @@ -0,0 +1,4 @@ +ensembl_gene_id,hgnc_symbol +ENSG00000000419,ADAMTS1 +ENSG00000000460,APH1B +ENSG00000000460, diff --git a/tests/test_assets/gene_info/input/median_expression_good_input.csv b/tests/test_assets/gene_info/input/median_expression_good_input.csv new file mode 100644 index 00000000..c86caf60 --- /dev/null +++ b/tests/test_assets/gene_info/input/median_expression_good_input.csv @@ -0,0 +1,11 @@ +ensembl_gene_id,min,first_quartile,median,mean,third_quartile,max,tissue +ENSG00000000419,4.14,4.82,4.99,4.98,5.12,5.47,TCX +ENSG00000000457,1.87,3.31,3.56,3.53,3.79,4.29,TCX +ENSG00000000971,2.38,3.78,4.78,4.66,5.42,7.48, +ENSG00000001036,2.55,3.42,3.69,3.67,3.9,4.69,TCX +ENSG00000001631,2.39,3.43,3.58,3.56,3.71,4.0,TCX +ENSG00000000419,2.46,3.81,4.11,4.11,4.4,5.85,DLPFC +ENSG00000000971,1.75,3.69,4.33,4.38,4.99,8.17,DLPFC +ENSG00000000419,2.43,3.82,4.17,4.13,4.5,5.35,IFG +ENSG00000001036,2.04,3.44,3.64,3.63,3.86,4.8,IFG +ENSG00000001631,3.07,3.75,3.88,3.88,4.01,4.42,IFG diff --git 
a/tests/test_assets/gene_info/input/proteomics_good_input.csv b/tests/test_assets/gene_info/input/proteomics_good_input.csv new file mode 100644 index 00000000..7071f5a2 --- /dev/null +++ b/tests/test_assets/gene_info/input/proteomics_good_input.csv @@ -0,0 +1,17 @@ +uniqid,genename,uniprotid,ensembl_gene_id,tissue,log2_fc,ci_upr,ci_lwr,pval,cor_pval +DPM1|O60762,DPM1,O60762,ENSG00000000419,DLPFC,-0.001,0.109,-0.11,1.0,1.0 +GCLC|P48506,GCLC,P48506,ENSG00000001084,DLPFC,0.172,0.241,0.104,0.0,0.0 +CFH|P08603,CFH,P08603,ENSG00000000971,DLPFC,,,,, +CYP51A1|Q16850,CYP51A1,Q16850,ENSG00000001630,DLPFC,,,,, +DPM1|O60762,DPM1,O60762,ENSG00000000419,MFG,0.175,0.814,-0.463,0.784,1.0 +GCLC|P48506,GCLC,P48506,ENSG00000001084,MFG,0.011,0.23,-0.207,0.991,1.0 +CFH|P08603,CFH,P08603,ENSG00000000971,,0.005,0.8,-0.79,1.0,1.0 +CYP51A1|Q16850,CYP51A1,Q16850,ENSG00000001630,MFG,,,,, +DPM1|O60762,DPM1,,ENSG00000000419,TCX,-0.149,0.011,-0.309,0.073,0.383 +GCLC|P48506,GCLC,P48506,ENSG00000001084,TCX,-0.043,0.042,-0.128,0.462,1.0 +CFH|P08603,CFH,P08603,ENSG00000000971,TCX,,,,, +CYP51A1|Q16850,CYP51A1,Q16850,ENSG00000001630,TCX,,,,, +DPM1|O60762,DPM1,O60762,ENSG00000000419,AntPFC,0.052,0.215,-0.111,0.734,1.0 +GCLC|P48506,,P48506,ENSG00000001084,AntPFC,-0.023,0.077,-0.123,0.848,1.0 +CFH|P08603,CFH,P08603,ENSG00000000971,AntPFC,,,,, +,CYP51A1,Q16850,ENSG00000001630,AntPFC,0.265,0.567,-0.037,0.099,0.565 diff --git a/tests/test_assets/gene_info/input/proteomics_srm_good_input.csv b/tests/test_assets/gene_info/input/proteomics_srm_good_input.csv new file mode 100644 index 00000000..41500f33 --- /dev/null +++ b/tests/test_assets/gene_info/input/proteomics_srm_good_input.csv @@ -0,0 +1,7 @@ +uniqid,genename,uniprotid,ensembl_gene_id,tissue,log2_fc,ci_upr,ci_lwr,pval,cor_pval +CD2AP|Q9Y5K6,CD2AP,Q9Y5K6,ENSG00000001630,DLPFC,0.026,0.102,-0.05,0.696,0.952 +,SNCA,P37840,ENSG00000001629,DLPFC,-0.026,0.028,-0.08,0.505,0.72 +NDUFA7|O95182,,O95182,ENSG00000000419,DLPFC,-0.083,-0.034,-0.131,0.0,0.001 
+DIP2B|Q9P265,DIP2B,,ENSG00000001036,DLPFC,0.002,0.072,-0.068,0.997, +VSNL1|P62760,VSNL1,P62760,ENSG00000000971,,-0.014,0.015,-0.042,0.493,0.718 +SYT11|Q9BT88,SYT11,Q9BT88,ENSG00000000005,DLPFC,0.029,0.074,-0.016,0.281,0.496 diff --git a/tests/test_assets/gene_info/input/proteomics_srm_type_error.csv b/tests/test_assets/gene_info/input/proteomics_srm_type_error.csv new file mode 100644 index 00000000..42db103b --- /dev/null +++ b/tests/test_assets/gene_info/input/proteomics_srm_type_error.csv @@ -0,0 +1,3 @@ +uniqid,genename,uniprotid,ensembl_gene_id,tissue,log2_fc,ci_upr,ci_lwr,pval,cor_pval +CD2AP|Q9Y5K6,CD2AP,Q9Y5K6,ENSG00000001630,DLPFC,0.026,0.102,-0.05,0.696,string_value +,SNCA,P37840,ENSG00000001629,DLPFC,-0.026,0.028,-0.08,0.505,0.72 diff --git a/tests/test_assets/gene_info/input/proteomics_tmt_good_input.csv b/tests/test_assets/gene_info/input/proteomics_tmt_good_input.csv new file mode 100644 index 00000000..c59d93ed --- /dev/null +++ b/tests/test_assets/gene_info/input/proteomics_tmt_good_input.csv @@ -0,0 +1,7 @@ +uniqid,genename,uniprotid,ensembl_gene_id,tissue,log2_fc,ci_upr,ci_lwr,pval,cor_pval +CYP51A1|A0A0C4DFL7,CYP51A1,A0A0C4DFL7,ENSG00000001630,DLPFC,-0.097,-0.025,-0.168,0.009,0.06 +ANKIB1|Q9P2G1,ANKIB1,Q9P2G1,ENSG00000001629,,-0.023,-0.003,-0.044,0.027,0.124 +,DPM1,H0Y368,ENSG00000000419,DLPFC,-0.032,-0.002,-0.062,0.038,0.153 +FUCA2|Q9BTY2,,Q9BTY2,ENSG00000001036,DLPFC,-0.089,0.006,-0.184,0.066,0.213 +CFH|P08603,CFH,P08603,ENSG00000000971,DLPFC,,0.156,-0.004,0.062,0.205 +KRIT1|O00522,KRIT1,,ENSG00000001631,DLPFC,0.053,0.121,-0.015,0.128,0.319 diff --git a/tests/test_assets/gene_info/input/proteomics_tmt_type_error.csv b/tests/test_assets/gene_info/input/proteomics_tmt_type_error.csv new file mode 100644 index 00000000..b164f563 --- /dev/null +++ b/tests/test_assets/gene_info/input/proteomics_tmt_type_error.csv @@ -0,0 +1,3 @@ +uniqid,genename,uniprotid,ensembl_gene_id,tissue,log2_fc,ci_upr,ci_lwr,pval,cor_pval 
+CYP51A1|A0A0C4DFL7,CYP51A1,A0A0C4DFL7,ENSG00000001630,DLPFC,-0.097,-0.025,-0.168,0.009,string_value +ANKIB1|Q9P2G1,ANKIB1,Q9P2G1,ENSG00000001629,,-0.023,-0.003,-0.044,0.027,0.124 diff --git a/tests/test_assets/gene_info/input/proteomics_type_error.csv b/tests/test_assets/gene_info/input/proteomics_type_error.csv new file mode 100644 index 00000000..23cb89b6 --- /dev/null +++ b/tests/test_assets/gene_info/input/proteomics_type_error.csv @@ -0,0 +1,3 @@ +uniqid,genename,uniprotid,ensembl_gene_id,tissue,log2_fc,ci_upr,ci_lwr,pval,cor_pval +DPM1|O60762,DPM1,O60762,ENSG00000000419,DLPFC,-0.001,0.109,-0.11,1.0,string_value +GCLC|P48506,GCLC,P48506,ENSG00000001084,DLPFC,0.172,0.241,0.104,0.0,0.0 diff --git a/tests/test_assets/gene_info/input/target_list_good_input.csv b/tests/test_assets/gene_info/input/target_list_good_input.csv new file mode 100644 index 00000000..b6796717 --- /dev/null +++ b/tests/test_assets/gene_info/input/target_list_good_input.csv @@ -0,0 +1,6 @@ +source,team,rank,ensembl_gene_id,hgnc_symbol,target_choice_justification,predicted_therapeutic_direction,data_used_to_support_target_selection,data_synapseid,study,input_data,validation_study_details,initial_nomination +Source_1,Team_1,17-100,ENSG00000000005,TNMD,Justification 1,Prediction 1,Support 1,syn12345,Study_1,"Genetics, RNA, Protein, Clinical",,2018.0 +Source_1,Team_2,,ENSG00000000419,DPM1,Justification 2,Prediction 2,Support 2,,Study_1,,Validation 2,2023.0 +Source_2,Team_3,12,ENSG00000000419,DPM1,Justification 3,,Support 3,,Study_2,RNA,Validation 3,2022.0 +Source_3,Team_4,1-10,ENSG00000000419,DPM1,,Prediction 4,Support 4,syn56789,Study_3,"Protein, Clinical",Validation 4,2018.0 +Source_4,Team_5,,ENSG00000000457,SCYL3,Justification 5,Prediction 5,,,Study_3,RNA,Validation 5, diff --git a/tests/test_assets/gene_info/input/tep_adi_info_good_input.csv b/tests/test_assets/gene_info/input/tep_adi_info_good_input.csv new file mode 100644 index 00000000..ae081666 --- /dev/null +++ 
b/tests/test_assets/gene_info/input/tep_adi_info_good_input.csv @@ -0,0 +1,6 @@ +ensembl_gene_id,hgnc_symbol,is_adi,is_tep +ENSG00000000005,TNMD,True,True +ENSG00000000419,DPM1,,False +ENSG00000001497,LAS1L,True, +ENSG00000001084,GCLC,,True +ENSG00000183791,ABCD,, diff --git a/tests/test_assets/gene_info/input/tep_adi_info_type_error_1.csv b/tests/test_assets/gene_info/input/tep_adi_info_type_error_1.csv new file mode 100644 index 00000000..ec127faa --- /dev/null +++ b/tests/test_assets/gene_info/input/tep_adi_info_type_error_1.csv @@ -0,0 +1,3 @@ +ensembl_gene_id,hgnc_symbol,is_adi,is_tep +ENSG00000000005,,True,True +ENSG00000000419,DPM1,,False diff --git a/tests/test_assets/gene_info/input/tep_adi_info_type_error_2.csv b/tests/test_assets/gene_info/input/tep_adi_info_type_error_2.csv new file mode 100644 index 00000000..061eb48a --- /dev/null +++ b/tests/test_assets/gene_info/input/tep_adi_info_type_error_2.csv @@ -0,0 +1,3 @@ +ensembl_gene_id,hgnc_symbol,is_adi,is_tep +ENSG00000000005,TNMD,string_value,True +ENSG00000000419,DPM1,,False diff --git a/tests/test_assets/gene_info/input/tep_adi_info_type_error_3.csv b/tests/test_assets/gene_info/input/tep_adi_info_type_error_3.csv new file mode 100644 index 00000000..c4703236 --- /dev/null +++ b/tests/test_assets/gene_info/input/tep_adi_info_type_error_3.csv @@ -0,0 +1,3 @@ +ensembl_gene_id,hgnc_symbol,is_adi,is_tep +ENSG00000000005,TNMD,True,string_value +ENSG00000000419,DPM1,,False diff --git a/tests/test_assets/gene_info/output/gene_info_good_output_1.json b/tests/test_assets/gene_info/output/gene_info_good_output_1.json new file mode 100644 index 00000000..84f16154 --- /dev/null +++ b/tests/test_assets/gene_info/output/gene_info_good_output_1.json @@ -0,0 +1,851 @@ +[ + { + "ensembl_gene_id": "ENSG00000000005", + "name": "tenomodulin...", + "summary": "Summary 1", + "symbol": "TNMD", + "alias": [ + "BRICD4", + "CHM1L", + "TEM" + ], + "is_igap": false, + "is_eqtl": false, + "is_any_rna_changed_in_ad_brain": false, 
+ "rna_brain_change_studied": false, + "is_any_protein_changed_in_ad_brain": false, + "protein_brain_change_studied": true, + "target_nominations": [ + { + "source": "Source_1", + "team": "Team_1", + "rank": "17-100", + "hgnc_symbol": "TNMD", + "target_choice_justification": "Justification 1", + "predicted_therapeutic_direction": "Prediction 1", + "data_used_to_support_target_selection": "Support 1", + "data_synapseid": "syn12345", + "study": "Study_1", + "input_data": "Genetics, RNA, Protein, Clinical", + "validation_study_details": null, + "initial_nomination": 2018.0 + } + ], + "median_expression": null, + "druggability": [ + { + "sm_druggability_bucket": 7, + "safety_bucket": 4, + "abability_bucket": 3, + "pharos_class": "Tbio", + "classification": null, + "safety_bucket_definition": "Safety definition 1", + "abability_bucket_definition": "Abability definition 1" + } + ], + "total_nominations": 1.0, + "biodomains": [ + "Proteostasis", + "Synapse" + ], + "is_adi": true, + "is_tep": true, + "resource_url": "https://adknowledgeportal.synapse.org/Explore/Target%20Enabling%20Resources?QueryWrapper0=%7B%22sql%22%3A%22select%20*%20from%20syn26146692%20WHERE%20%60isPublic%60%20%3D%20true%22%2C%22limit%22%3A25%2C%22offset%22%3A0%2C%22selectedFacets%22%3A%5B%7B%22concreteType%22%3A%22org.sagebionetworks.repo.model.table.FacetColumnValuesRequest%22%2C%22columnName%22%3A%22target%22%2C%22facetValues%22%3A%5B%22TNMD%22%5D%7D%5D%7D", + "ensembl_info": { + "ensembl_release": "111", + "ensembl_possible_replacements": [], + "ensembl_permalink": "https://jan2024.archive.ensembl.org/Homo_sapiens/Gene/Summary?db=core;g=ENSG00000000005" + } + }, + { + "ensembl_gene_id": "ENSG00000000419", + "name": "dolichyl-phosphate m...", + "summary": "Summary 2", + "symbol": "DPM1", + "alias": [ + "MPDS", + "CDGIE" + ], + "is_igap": true, + "is_eqtl": true, + "is_any_rna_changed_in_ad_brain": true, + "rna_brain_change_studied": true, + "is_any_protein_changed_in_ad_brain": true, + 
"protein_brain_change_studied": true, + "target_nominations": [ + { + "source": "Source_1", + "team": "Team_2", + "rank": null, + "hgnc_symbol": "DPM1", + "target_choice_justification": "Justification 2", + "predicted_therapeutic_direction": "Prediction 2", + "data_used_to_support_target_selection": "Support 2", + "data_synapseid": null, + "study": "Study_1", + "input_data": null, + "validation_study_details": "Validation 2", + "initial_nomination": 2023.0 + }, + { + "source": "Source_2", + "team": "Team_3", + "rank": "12", + "hgnc_symbol": "DPM1", + "target_choice_justification": "Justification 3", + "predicted_therapeutic_direction": null, + "data_used_to_support_target_selection": "Support 3", + "data_synapseid": null, + "study": "Study_2", + "input_data": "RNA", + "validation_study_details": "Validation 3", + "initial_nomination": 2022.0 + }, + { + "source": "Source_3", + "team": "Team_4", + "rank": "1-10", + "hgnc_symbol": "DPM1", + "target_choice_justification": null, + "predicted_therapeutic_direction": "Prediction 4", + "data_used_to_support_target_selection": "Support 4", + "data_synapseid": "syn56789", + "study": "Study_3", + "input_data": "Protein, Clinical", + "validation_study_details": "Validation 4", + "initial_nomination": 2018.0 + } + ], + "median_expression": [ + { + "min": 4.14, + "first_quartile": 4.82, + "median": 4.99, + "mean": 4.98, + "third_quartile": 5.12, + "max": 5.47, + "tissue": "TCX" + }, + { + "min": 2.46, + "first_quartile": 3.81, + "median": 4.11, + "mean": 4.11, + "third_quartile": 4.4, + "max": 5.85, + "tissue": "DLPFC" + }, + { + "min": 2.43, + "first_quartile": 3.82, + "median": 4.17, + "mean": 4.13, + "third_quartile": 4.5, + "max": 5.35, + "tissue": "IFG" + } + ], + "druggability": null, + "total_nominations": 3.0, + "biodomains": [ + "Apoptosis" + ], + "is_adi": false, + "is_tep": false, + "resource_url": null, + "ensembl_info": { + "ensembl_release": "111", + "ensembl_possible_replacements": [], + "ensembl_permalink": 
"https://jan2024.archive.ensembl.org/Homo_sapiens/Gene/Summary?db=core;g=ENSG00000000419" + } + }, + { + "ensembl_gene_id": "ENSG00000000457", + "name": "SCY1 like pseudokina...", + "summary": "Summary 3", + "symbol": "SCYL3", + "alias": [ + "PACE-1", + "PACE1" + ], + "is_igap": false, + "is_eqtl": false, + "is_any_rna_changed_in_ad_brain": false, + "rna_brain_change_studied": false, + "is_any_protein_changed_in_ad_brain": false, + "protein_brain_change_studied": false, + "target_nominations": [ + { + "source": "Source_4", + "team": "Team_5", + "rank": null, + "hgnc_symbol": "SCYL3", + "target_choice_justification": "Justification 5", + "predicted_therapeutic_direction": "Prediction 5", + "data_used_to_support_target_selection": null, + "data_synapseid": null, + "study": "Study_3", + "input_data": "RNA", + "validation_study_details": "Validation 5", + "initial_nomination": null + } + ], + "median_expression": [ + { + "min": 1.87, + "first_quartile": 3.31, + "median": 3.56, + "mean": 3.53, + "third_quartile": 3.79, + "max": 4.29, + "tissue": "TCX" + } + ], + "druggability": null, + "total_nominations": 1.0, + "biodomains": [ + "Structural Stabilization" + ], + "is_adi": false, + "is_tep": false, + "resource_url": null, + "ensembl_info": { + "ensembl_release": null, + "ensembl_possible_replacements": [], + "ensembl_permalink": "https://jan2024.archive.ensembl.org/Homo_sapiens/Gene/Summary?db=core;g=ENSG00000000457" + } + }, + { + "ensembl_gene_id": "ENSG00000000460", + "name": "FIGNL1 interacting r...", + "summary": null, + "symbol": "FIRRM", + "alias": [ + "FLIP", + "MEICA1", + "C1orf112", + "Apolo1" + ], + "is_igap": true, + "is_eqtl": false, + "is_any_rna_changed_in_ad_brain": false, + "rna_brain_change_studied": false, + "is_any_protein_changed_in_ad_brain": false, + "protein_brain_change_studied": false, + "target_nominations": null, + "median_expression": null, + "druggability": [ + { + "sm_druggability_bucket": 13, + "safety_bucket": 3, + "abability_bucket": 
3, + "pharos_class": "Tdark", + "classification": "Classification 3", + "safety_bucket_definition": "Safety definition 3", + "abability_bucket_definition": null + } + ], + "total_nominations": null, + "biodomains": null, + "is_adi": false, + "is_tep": false, + "resource_url": null, + "ensembl_info": { + "ensembl_release": "111", + "ensembl_possible_replacements": [], + "ensembl_permalink": "https://jan2024.archive.ensembl.org/Homo_sapiens/Gene/Summary?db=core;g=ENSG00000000460" + } + }, + { + "ensembl_gene_id": "ENSG00000000938", + "name": "FGR proto-oncogene, ...", + "summary": "Summary 5", + "symbol": "FGR", + "alias": [ + "c-fgr", + "SRC2", + "p55c-fgr", + "p58-Fgr", + "p55-Fgr", + "p58c-fgr", + "c-src2" + ], + "is_igap": false, + "is_eqtl": false, + "is_any_rna_changed_in_ad_brain": false, + "rna_brain_change_studied": false, + "is_any_protein_changed_in_ad_brain": false, + "protein_brain_change_studied": false, + "target_nominations": null, + "median_expression": null, + "druggability": null, + "total_nominations": null, + "biodomains": null, + "is_adi": false, + "is_tep": false, + "resource_url": null, + "ensembl_info": { + "ensembl_release": "111", + "ensembl_possible_replacements": [], + "ensembl_permalink": "https://jan2024.archive.ensembl.org/Homo_sapiens/Gene/Summary?db=core;g=ENSG00000000938" + } + }, + { + "ensembl_gene_id": "ENSG00000000971", + "name": null, + "summary": "Summary 6", + "symbol": "CFH", + "alias": [], + "is_igap": true, + "is_eqtl": true, + "is_any_rna_changed_in_ad_brain": false, + "rna_brain_change_studied": false, + "is_any_protein_changed_in_ad_brain": false, + "protein_brain_change_studied": true, + "target_nominations": null, + "median_expression": [ + { + "min": 2.38, + "first_quartile": 3.78, + "median": 4.78, + "mean": 4.66, + "third_quartile": 5.42, + "max": 7.48, + "tissue": null + }, + { + "min": 1.75, + "first_quartile": 3.69, + "median": 4.33, + "mean": 4.38, + "third_quartile": 4.99, + "max": 8.17, + "tissue": "DLPFC" + 
} + ], + "druggability": [ + { + "sm_druggability_bucket": 3, + "safety_bucket": 4, + "abability_bucket": 1, + "pharos_class": "Tbio", + "classification": "Classification 4", + "safety_bucket_definition": "Safety definition 4", + "abability_bucket_definition": "Abability definition 4" + } + ], + "total_nominations": null, + "biodomains": null, + "is_adi": false, + "is_tep": false, + "resource_url": null, + "ensembl_info": { + "ensembl_release": "111", + "ensembl_possible_replacements": [], + "ensembl_permalink": "https://jan2024.archive.ensembl.org/Homo_sapiens/Gene/Summary?db=core;g=ENSG00000000971" + } + }, + { + "ensembl_gene_id": "ENSG00000001036", + "name": "alpha-L-fucosidase 2...", + "summary": "Summary 7", + "symbol": "FUCA2", + "alias": [ + "dJ20N2.5" + ], + "is_igap": false, + "is_eqtl": false, + "is_any_rna_changed_in_ad_brain": false, + "rna_brain_change_studied": false, + "is_any_protein_changed_in_ad_brain": false, + "protein_brain_change_studied": true, + "target_nominations": null, + "median_expression": [ + { + "min": 2.55, + "first_quartile": 3.42, + "median": 3.69, + "mean": 3.67, + "third_quartile": 3.9, + "max": 4.69, + "tissue": "TCX" + }, + { + "min": 2.04, + "first_quartile": 3.44, + "median": 3.64, + "mean": 3.63, + "third_quartile": 3.86, + "max": 4.8, + "tissue": "IFG" + } + ], + "druggability": [ + { + "sm_druggability_bucket": 1, + "safety_bucket": 3, + "abability_bucket": 1, + "pharos_class": "Tchem", + "classification": "Classification 2", + "safety_bucket_definition": null, + "abability_bucket_definition": "Abability definition 2" + } + ], + "total_nominations": null, + "biodomains": null, + "is_adi": false, + "is_tep": false, + "resource_url": null, + "ensembl_info": { + "ensembl_release": "111", + "ensembl_possible_replacements": [], + "ensembl_permalink": "https://jan2024.archive.ensembl.org/Homo_sapiens/Gene/Summary?db=core;g=ENSG00000001036" + } + }, + { + "ensembl_gene_id": "ENSG00000001084", + "name": "glutamate-cysteine 
l...", + "summary": "Summary 8", + "symbol": "GCLC", + "alias": [ + "GLCL", + "GCL", + "GLCLC", + "GCS" + ], + "is_igap": true, + "is_eqtl": false, + "is_any_rna_changed_in_ad_brain": false, + "rna_brain_change_studied": false, + "is_any_protein_changed_in_ad_brain": true, + "protein_brain_change_studied": true, + "target_nominations": null, + "median_expression": null, + "druggability": [ + { + "sm_druggability_bucket": 1, + "safety_bucket": 5, + "abability_bucket": 3, + "pharos_class": null, + "classification": "Classification 5", + "safety_bucket_definition": "Safety definition 5", + "abability_bucket_definition": "Abability definition 5" + } + ], + "total_nominations": null, + "biodomains": null, + "is_adi": false, + "is_tep": true, + "resource_url": "https://adknowledgeportal.synapse.org/Explore/Target%20Enabling%20Resources?QueryWrapper0=%7B%22sql%22%3A%22select%20*%20from%20syn26146692%20WHERE%20%60isPublic%60%20%3D%20true%22%2C%22limit%22%3A25%2C%22offset%22%3A0%2C%22selectedFacets%22%3A%5B%7B%22concreteType%22%3A%22org.sagebionetworks.repo.model.table.FacetColumnValuesRequest%22%2C%22columnName%22%3A%22target%22%2C%22facetValues%22%3A%5B%22GCLC%22%5D%7D%5D%7D", + "ensembl_info": { + "ensembl_release": "111", + "ensembl_possible_replacements": [], + "ensembl_permalink": null + } + }, + { + "ensembl_gene_id": "ENSG00000001167", + "name": "nuclear transcriptio...", + "summary": "Summary 9", + "symbol": "NFYA", + "alias": [ + "CBF-B", + "NF-YA", + "HAP2", + "CBF-A" + ], + "is_igap": false, + "is_eqtl": false, + "is_any_rna_changed_in_ad_brain": false, + "rna_brain_change_studied": false, + "is_any_protein_changed_in_ad_brain": false, + "protein_brain_change_studied": false, + "target_nominations": null, + "median_expression": null, + "druggability": null, + "total_nominations": null, + "biodomains": null, + "is_adi": false, + "is_tep": false, + "resource_url": null, + "ensembl_info": { + "ensembl_release": "111", + "ensembl_possible_replacements": [], + 
"ensembl_permalink": "https://jan2024.archive.ensembl.org/Homo_sapiens/Gene/Summary?db=core;g=ENSG00000001167" + } + }, + { + "ensembl_gene_id": "ENSG00000001460", + "name": "sperm tail PG-rich r...", + "summary": "Summary 10", + "symbol": "STPG1", + "alias": [ + "C1orf201", + "MAPO2" + ], + "is_igap": true, + "is_eqtl": true, + "is_any_rna_changed_in_ad_brain": false, + "rna_brain_change_studied": true, + "is_any_protein_changed_in_ad_brain": false, + "protein_brain_change_studied": false, + "target_nominations": null, + "median_expression": null, + "druggability": null, + "total_nominations": null, + "biodomains": null, + "is_adi": false, + "is_tep": false, + "resource_url": null, + "ensembl_info": { + "ensembl_release": "111", + "ensembl_possible_replacements": [], + "ensembl_permalink": "https://jan2024.archive.ensembl.org/Homo_sapiens/Gene/Summary?db=core;g=ENSG00000001460" + } + }, + { + "ensembl_gene_id": "ENSG00000001461", + "name": "NIPA like domain con...", + "summary": "Summary 11", + "symbol": null, + "alias": [ + "DJ462O23.2", + "SLC57A5", + "NPAL3" + ], + "is_igap": false, + "is_eqtl": false, + "is_any_rna_changed_in_ad_brain": false, + "rna_brain_change_studied": false, + "is_any_protein_changed_in_ad_brain": false, + "protein_brain_change_studied": false, + "target_nominations": null, + "median_expression": null, + "druggability": null, + "total_nominations": null, + "biodomains": null, + "is_adi": false, + "is_tep": false, + "resource_url": null, + "ensembl_info": { + "ensembl_release": "111", + "ensembl_possible_replacements": [], + "ensembl_permalink": "https://jan2024.archive.ensembl.org/Homo_sapiens/Gene/Summary?db=core;g=ENSG00000001461" + } + }, + { + "ensembl_gene_id": "ENSG00000001497", + "name": "LAS1 like ribosome b...", + "summary": "Summary 12", + "symbol": "LAS1L", + "alias": [ + "Las1-like", + "Las1", + "dJ475B7.2", + "MRXSWTS", + "WTS" + ], + "is_igap": false, + "is_eqtl": false, + "is_any_rna_changed_in_ad_brain": false, + 
"rna_brain_change_studied": false, + "is_any_protein_changed_in_ad_brain": false, + "protein_brain_change_studied": false, + "target_nominations": null, + "median_expression": null, + "druggability": null, + "total_nominations": null, + "biodomains": null, + "is_adi": true, + "is_tep": false, + "resource_url": "https://adknowledgeportal.synapse.org/Explore/Target%20Enabling%20Resources?QueryWrapper0=%7B%22sql%22%3A%22select%20*%20from%20syn26146692%20WHERE%20%60isPublic%60%20%3D%20true%22%2C%22limit%22%3A25%2C%22offset%22%3A0%2C%22selectedFacets%22%3A%5B%7B%22concreteType%22%3A%22org.sagebionetworks.repo.model.table.FacetColumnValuesRequest%22%2C%22columnName%22%3A%22target%22%2C%22facetValues%22%3A%5B%22LAS1L%22%5D%7D%5D%7D", + "ensembl_info": { + "ensembl_release": "111", + "ensembl_possible_replacements": [], + "ensembl_permalink": "https://jan2024.archive.ensembl.org/Homo_sapiens/Gene/Summary?db=core;g=ENSG00000001497" + } + }, + { + "ensembl_gene_id": "ENSG00000001561", + "name": "ectonucleotide pyrop...", + "summary": "Summary 13", + "symbol": "ENPP4", + "alias": [ + "NPP4" + ], + "is_igap": false, + "is_eqtl": false, + "is_any_rna_changed_in_ad_brain": false, + "rna_brain_change_studied": false, + "is_any_protein_changed_in_ad_brain": false, + "protein_brain_change_studied": false, + "target_nominations": null, + "median_expression": null, + "druggability": null, + "total_nominations": null, + "biodomains": null, + "is_adi": false, + "is_tep": false, + "resource_url": null, + "ensembl_info": { + "ensembl_release": "111", + "ensembl_possible_replacements": [], + "ensembl_permalink": "https://jan2024.archive.ensembl.org/Homo_sapiens/Gene/Summary?db=core;g=ENSG00000001561" + } + }, + { + "ensembl_gene_id": "ENSG00000001617", + "name": "semaphorin 3F...", + "summary": "Summary 14", + "symbol": "SEMA3F", + "alias": [ + "SEMA4", + "SEMA-IV", + "SEMAK" + ], + "is_igap": false, + "is_eqtl": false, + "is_any_rna_changed_in_ad_brain": false, + 
"rna_brain_change_studied": false, + "is_any_protein_changed_in_ad_brain": false, + "protein_brain_change_studied": false, + "target_nominations": null, + "median_expression": null, + "druggability": null, + "total_nominations": null, + "biodomains": null, + "is_adi": false, + "is_tep": false, + "resource_url": null, + "ensembl_info": { + "ensembl_release": "111", + "ensembl_possible_replacements": [], + "ensembl_permalink": "https://jan2024.archive.ensembl.org/Homo_sapiens/Gene/Summary?db=core;g=ENSG00000001617" + } + }, + { + "ensembl_gene_id": "ENSG00000001626", + "name": "CF transmembrane con...", + "summary": "Summary 15", + "symbol": "CFTR", + "alias": [ + "ABCC7", + "MRP7", + "CFTR/MRP", + "dJ760C5.1", + "TNR-CFTR", + "ABC35", + "CF" + ], + "is_igap": false, + "is_eqtl": true, + "is_any_rna_changed_in_ad_brain": true, + "rna_brain_change_studied": true, + "is_any_protein_changed_in_ad_brain": false, + "protein_brain_change_studied": false, + "target_nominations": null, + "median_expression": null, + "druggability": null, + "total_nominations": null, + "biodomains": null, + "is_adi": false, + "is_tep": false, + "resource_url": null, + "ensembl_info": { + "ensembl_release": "111", + "ensembl_possible_replacements": [], + "ensembl_permalink": "https://jan2024.archive.ensembl.org/Homo_sapiens/Gene/Summary?db=core;g=ENSG00000001626" + } + }, + { + "ensembl_gene_id": "ENSG00000001629", + "name": "ankyrin repeat and I...", + "summary": "Summary 16", + "symbol": "ANKIB1", + "alias": [], + "is_igap": false, + "is_eqtl": false, + "is_any_rna_changed_in_ad_brain": true, + "rna_brain_change_studied": true, + "is_any_protein_changed_in_ad_brain": false, + "protein_brain_change_studied": true, + "target_nominations": null, + "median_expression": null, + "druggability": null, + "total_nominations": null, + "biodomains": null, + "is_adi": false, + "is_tep": false, + "resource_url": null, + "ensembl_info": { + "ensembl_release": "111", + "ensembl_possible_replacements": [], 
+ "ensembl_permalink": "https://jan2024.archive.ensembl.org/Homo_sapiens/Gene/Summary?db=core;g=ENSG00000001629" + } + }, + { + "ensembl_gene_id": "ENSG00000001630", + "name": "cytochrome P450 fami...", + "summary": "Summary 17", + "symbol": "CYP51A1", + "alias": [ + "P450L1", + "P450-14DM", + "LDM", + "CP51", + "CYPL1", + "CYP51" + ], + "is_igap": false, + "is_eqtl": false, + "is_any_rna_changed_in_ad_brain": false, + "rna_brain_change_studied": false, + "is_any_protein_changed_in_ad_brain": false, + "protein_brain_change_studied": true, + "target_nominations": null, + "median_expression": null, + "druggability": null, + "total_nominations": null, + "biodomains": null, + "is_adi": false, + "is_tep": false, + "resource_url": null, + "ensembl_info": { + "ensembl_release": "111", + "ensembl_possible_replacements": [], + "ensembl_permalink": "https://jan2024.archive.ensembl.org/Homo_sapiens/Gene/Summary?db=core;g=ENSG00000001630" + } + }, + { + "ensembl_gene_id": "ENSG00000001631", + "name": "KRIT1 ankyrin repeat...", + "summary": "Summary 18", + "symbol": "KRIT1", + "alias": [ + "CAM", + "CCM1" + ], + "is_igap": false, + "is_eqtl": false, + "is_any_rna_changed_in_ad_brain": true, + "rna_brain_change_studied": true, + "is_any_protein_changed_in_ad_brain": false, + "protein_brain_change_studied": true, + "target_nominations": null, + "median_expression": [ + { + "min": 2.39, + "first_quartile": 3.43, + "median": 3.58, + "mean": 3.56, + "third_quartile": 3.71, + "max": 4.0, + "tissue": "TCX" + }, + { + "min": 3.07, + "first_quartile": 3.75, + "median": 3.88, + "mean": 3.88, + "third_quartile": 4.01, + "max": 4.42, + "tissue": "IFG" + } + ], + "druggability": null, + "total_nominations": null, + "biodomains": null, + "is_adi": false, + "is_tep": false, + "resource_url": null, + "ensembl_info": { + "ensembl_release": "111", + "ensembl_possible_replacements": [], + "ensembl_permalink": 
"https://jan2024.archive.ensembl.org/Homo_sapiens/Gene/Summary?db=core;g=ENSG00000001631" + } + }, + { + "ensembl_gene_id": "ENSG00000161149", + "name": null, + "summary": null, + "symbol": null, + "alias": [], + "is_igap": false, + "is_eqtl": false, + "is_any_rna_changed_in_ad_brain": false, + "rna_brain_change_studied": false, + "is_any_protein_changed_in_ad_brain": false, + "protein_brain_change_studied": false, + "target_nominations": null, + "median_expression": null, + "druggability": null, + "total_nominations": null, + "biodomains": null, + "is_adi": false, + "is_tep": false, + "resource_url": null, + "ensembl_info": { + "ensembl_release": "107", + "ensembl_possible_replacements": [ + "ENSG00000284130" + ], + "ensembl_permalink": "https://jul2022.archive.ensembl.org/Homo_sapiens/Gene/Summary?db=core;g=ENSG00000161149" + } + }, + { + "ensembl_gene_id": "ENSG00000183791", + "name": null, + "summary": null, + "symbol": null, + "alias": [], + "is_igap": false, + "is_eqtl": false, + "is_any_rna_changed_in_ad_brain": false, + "rna_brain_change_studied": false, + "is_any_protein_changed_in_ad_brain": false, + "protein_brain_change_studied": false, + "target_nominations": null, + "median_expression": null, + "druggability": null, + "total_nominations": null, + "biodomains": null, + "is_adi": false, + "is_tep": false, + "resource_url": null, + "ensembl_info": { + "ensembl_release": "100", + "ensembl_possible_replacements": [ + "ENSG00000288631", + "ENSG00000288616", + "ENSG00000288607" + ], + "ensembl_permalink": "https://apr2020.archive.ensembl.org/Homo_sapiens/Gene/Summary?db=core;g=ENSG00000183791" + } + }, + { + "ensembl_gene_id": "ENSG00000001517", + "name": null, + "summary": null, + "symbol": null, + "alias": [], + "is_igap": false, + "is_eqtl": false, + "is_any_rna_changed_in_ad_brain": false, + "rna_brain_change_studied": false, + "is_any_protein_changed_in_ad_brain": false, + "protein_brain_change_studied": false, + "target_nominations": null, + 
"median_expression": null, + "druggability": null, + "total_nominations": null, + "biodomains": null, + "is_adi": false, + "is_tep": false, + "resource_url": null, + "ensembl_info": { + "ensembl_release": null, + "ensembl_possible_replacements": [], + "ensembl_permalink": null + } + } +] \ No newline at end of file diff --git a/tests/test_assets/gene_info/output/gene_info_good_output_2.json b/tests/test_assets/gene_info/output/gene_info_good_output_2.json new file mode 100644 index 00000000..b57e63aa --- /dev/null +++ b/tests/test_assets/gene_info/output/gene_info_good_output_2.json @@ -0,0 +1,851 @@ +[ + { + "ensembl_gene_id": "ENSG00000000005", + "name": "tenomodulin...", + "summary": "Summary 1", + "symbol": "TNMD", + "alias": [ + "BRICD4", + "CHM1L", + "TEM" + ], + "is_igap": false, + "is_eqtl": false, + "is_any_rna_changed_in_ad_brain": false, + "rna_brain_change_studied": false, + "is_any_protein_changed_in_ad_brain": true, + "protein_brain_change_studied": true, + "target_nominations": [ + { + "source": "Source_1", + "team": "Team_1", + "rank": "17-100", + "hgnc_symbol": "TNMD", + "target_choice_justification": "Justification 1", + "predicted_therapeutic_direction": "Prediction 1", + "data_used_to_support_target_selection": "Support 1", + "data_synapseid": "syn12345", + "study": "Study_1", + "input_data": "Genetics, RNA, Protein, Clinical", + "validation_study_details": null, + "initial_nomination": 2018.0 + } + ], + "median_expression": null, + "druggability": [ + { + "sm_druggability_bucket": 7, + "safety_bucket": 4, + "abability_bucket": 3, + "pharos_class": "Tbio", + "classification": null, + "safety_bucket_definition": "Safety definition 1", + "abability_bucket_definition": "Abability definition 1" + } + ], + "total_nominations": 1.0, + "biodomains": [ + "Proteostasis", + "Synapse" + ], + "is_adi": true, + "is_tep": true, + "resource_url": 
"https://adknowledgeportal.synapse.org/Explore/Target%20Enabling%20Resources?QueryWrapper0=%7B%22sql%22%3A%22select%20*%20from%20syn26146692%20WHERE%20%60isPublic%60%20%3D%20true%22%2C%22limit%22%3A25%2C%22offset%22%3A0%2C%22selectedFacets%22%3A%5B%7B%22concreteType%22%3A%22org.sagebionetworks.repo.model.table.FacetColumnValuesRequest%22%2C%22columnName%22%3A%22target%22%2C%22facetValues%22%3A%5B%22TNMD%22%5D%7D%5D%7D", + "ensembl_info": { + "ensembl_release": "111", + "ensembl_possible_replacements": [], + "ensembl_permalink": "https://jan2024.archive.ensembl.org/Homo_sapiens/Gene/Summary?db=core;g=ENSG00000000005" + } + }, + { + "ensembl_gene_id": "ENSG00000000419", + "name": "dolichyl-phosphate m...", + "summary": "Summary 2", + "symbol": "DPM1", + "alias": [ + "MPDS", + "CDGIE" + ], + "is_igap": true, + "is_eqtl": true, + "is_any_rna_changed_in_ad_brain": true, + "rna_brain_change_studied": true, + "is_any_protein_changed_in_ad_brain": true, + "protein_brain_change_studied": true, + "target_nominations": [ + { + "source": "Source_1", + "team": "Team_2", + "rank": null, + "hgnc_symbol": "DPM1", + "target_choice_justification": "Justification 2", + "predicted_therapeutic_direction": "Prediction 2", + "data_used_to_support_target_selection": "Support 2", + "data_synapseid": null, + "study": "Study_1", + "input_data": null, + "validation_study_details": "Validation 2", + "initial_nomination": 2023.0 + }, + { + "source": "Source_2", + "team": "Team_3", + "rank": "12", + "hgnc_symbol": "DPM1", + "target_choice_justification": "Justification 3", + "predicted_therapeutic_direction": null, + "data_used_to_support_target_selection": "Support 3", + "data_synapseid": null, + "study": "Study_2", + "input_data": "RNA", + "validation_study_details": "Validation 3", + "initial_nomination": 2022.0 + }, + { + "source": "Source_3", + "team": "Team_4", + "rank": "1-10", + "hgnc_symbol": "DPM1", + "target_choice_justification": null, + "predicted_therapeutic_direction": "Prediction 
4", + "data_used_to_support_target_selection": "Support 4", + "data_synapseid": "syn56789", + "study": "Study_3", + "input_data": "Protein, Clinical", + "validation_study_details": "Validation 4", + "initial_nomination": 2018.0 + } + ], + "median_expression": [ + { + "min": 4.14, + "first_quartile": 4.82, + "median": 4.99, + "mean": 4.98, + "third_quartile": 5.12, + "max": 5.47, + "tissue": "TCX" + }, + { + "min": 2.46, + "first_quartile": 3.81, + "median": 4.11, + "mean": 4.11, + "third_quartile": 4.4, + "max": 5.85, + "tissue": "DLPFC" + }, + { + "min": 2.43, + "first_quartile": 3.82, + "median": 4.17, + "mean": 4.13, + "third_quartile": 4.5, + "max": 5.35, + "tissue": "IFG" + } + ], + "druggability": null, + "total_nominations": 3.0, + "biodomains": [ + "Apoptosis" + ], + "is_adi": false, + "is_tep": false, + "resource_url": null, + "ensembl_info": { + "ensembl_release": "111", + "ensembl_possible_replacements": [], + "ensembl_permalink": "https://jan2024.archive.ensembl.org/Homo_sapiens/Gene/Summary?db=core;g=ENSG00000000419" + } + }, + { + "ensembl_gene_id": "ENSG00000000457", + "name": "SCY1 like pseudokina...", + "summary": "Summary 3", + "symbol": "SCYL3", + "alias": [ + "PACE-1", + "PACE1" + ], + "is_igap": false, + "is_eqtl": false, + "is_any_rna_changed_in_ad_brain": false, + "rna_brain_change_studied": false, + "is_any_protein_changed_in_ad_brain": false, + "protein_brain_change_studied": false, + "target_nominations": [ + { + "source": "Source_4", + "team": "Team_5", + "rank": null, + "hgnc_symbol": "SCYL3", + "target_choice_justification": "Justification 5", + "predicted_therapeutic_direction": "Prediction 5", + "data_used_to_support_target_selection": null, + "data_synapseid": null, + "study": "Study_3", + "input_data": "RNA", + "validation_study_details": "Validation 5", + "initial_nomination": null + } + ], + "median_expression": [ + { + "min": 1.87, + "first_quartile": 3.31, + "median": 3.56, + "mean": 3.53, + "third_quartile": 3.79, + "max": 
4.29, + "tissue": "TCX" + } + ], + "druggability": null, + "total_nominations": 1.0, + "biodomains": [ + "Structural Stabilization" + ], + "is_adi": false, + "is_tep": false, + "resource_url": null, + "ensembl_info": { + "ensembl_release": null, + "ensembl_possible_replacements": [], + "ensembl_permalink": "https://jan2024.archive.ensembl.org/Homo_sapiens/Gene/Summary?db=core;g=ENSG00000000457" + } + }, + { + "ensembl_gene_id": "ENSG00000000460", + "name": "FIGNL1 interacting r...", + "summary": null, + "symbol": "FIRRM", + "alias": [ + "FLIP", + "MEICA1", + "C1orf112", + "Apolo1" + ], + "is_igap": true, + "is_eqtl": false, + "is_any_rna_changed_in_ad_brain": false, + "rna_brain_change_studied": false, + "is_any_protein_changed_in_ad_brain": false, + "protein_brain_change_studied": false, + "target_nominations": null, + "median_expression": null, + "druggability": [ + { + "sm_druggability_bucket": 13, + "safety_bucket": 3, + "abability_bucket": 3, + "pharos_class": "Tdark", + "classification": "Classification 3", + "safety_bucket_definition": "Safety definition 3", + "abability_bucket_definition": null + } + ], + "total_nominations": null, + "biodomains": null, + "is_adi": false, + "is_tep": false, + "resource_url": null, + "ensembl_info": { + "ensembl_release": "111", + "ensembl_possible_replacements": [], + "ensembl_permalink": "https://jan2024.archive.ensembl.org/Homo_sapiens/Gene/Summary?db=core;g=ENSG00000000460" + } + }, + { + "ensembl_gene_id": "ENSG00000000938", + "name": "FGR proto-oncogene, ...", + "summary": "Summary 5", + "symbol": "FGR", + "alias": [ + "c-fgr", + "SRC2", + "p55c-fgr", + "p58-Fgr", + "p55-Fgr", + "p58c-fgr", + "c-src2" + ], + "is_igap": false, + "is_eqtl": false, + "is_any_rna_changed_in_ad_brain": false, + "rna_brain_change_studied": false, + "is_any_protein_changed_in_ad_brain": false, + "protein_brain_change_studied": false, + "target_nominations": null, + "median_expression": null, + "druggability": null, + "total_nominations": 
null, + "biodomains": null, + "is_adi": false, + "is_tep": false, + "resource_url": null, + "ensembl_info": { + "ensembl_release": "111", + "ensembl_possible_replacements": [], + "ensembl_permalink": "https://jan2024.archive.ensembl.org/Homo_sapiens/Gene/Summary?db=core;g=ENSG00000000938" + } + }, + { + "ensembl_gene_id": "ENSG00000000971", + "name": null, + "summary": "Summary 6", + "symbol": "CFH", + "alias": [], + "is_igap": true, + "is_eqtl": true, + "is_any_rna_changed_in_ad_brain": false, + "rna_brain_change_studied": false, + "is_any_protein_changed_in_ad_brain": true, + "protein_brain_change_studied": true, + "target_nominations": null, + "median_expression": [ + { + "min": 2.38, + "first_quartile": 3.78, + "median": 4.78, + "mean": 4.66, + "third_quartile": 5.42, + "max": 7.48, + "tissue": null + }, + { + "min": 1.75, + "first_quartile": 3.69, + "median": 4.33, + "mean": 4.38, + "third_quartile": 4.99, + "max": 8.17, + "tissue": "DLPFC" + } + ], + "druggability": [ + { + "sm_druggability_bucket": 3, + "safety_bucket": 4, + "abability_bucket": 1, + "pharos_class": "Tbio", + "classification": "Classification 4", + "safety_bucket_definition": "Safety definition 4", + "abability_bucket_definition": "Abability definition 4" + } + ], + "total_nominations": null, + "biodomains": null, + "is_adi": false, + "is_tep": false, + "resource_url": null, + "ensembl_info": { + "ensembl_release": "111", + "ensembl_possible_replacements": [], + "ensembl_permalink": "https://jan2024.archive.ensembl.org/Homo_sapiens/Gene/Summary?db=core;g=ENSG00000000971" + } + }, + { + "ensembl_gene_id": "ENSG00000001036", + "name": "alpha-L-fucosidase 2...", + "summary": "Summary 7", + "symbol": "FUCA2", + "alias": [ + "dJ20N2.5" + ], + "is_igap": false, + "is_eqtl": false, + "is_any_rna_changed_in_ad_brain": false, + "rna_brain_change_studied": false, + "is_any_protein_changed_in_ad_brain": true, + "protein_brain_change_studied": true, + "target_nominations": null, + "median_expression": [ 
+ { + "min": 2.55, + "first_quartile": 3.42, + "median": 3.69, + "mean": 3.67, + "third_quartile": 3.9, + "max": 4.69, + "tissue": "TCX" + }, + { + "min": 2.04, + "first_quartile": 3.44, + "median": 3.64, + "mean": 3.63, + "third_quartile": 3.86, + "max": 4.8, + "tissue": "IFG" + } + ], + "druggability": [ + { + "sm_druggability_bucket": 1, + "safety_bucket": 3, + "abability_bucket": 1, + "pharos_class": "Tchem", + "classification": "Classification 2", + "safety_bucket_definition": null, + "abability_bucket_definition": "Abability definition 2" + } + ], + "total_nominations": null, + "biodomains": null, + "is_adi": false, + "is_tep": false, + "resource_url": null, + "ensembl_info": { + "ensembl_release": "111", + "ensembl_possible_replacements": [], + "ensembl_permalink": "https://jan2024.archive.ensembl.org/Homo_sapiens/Gene/Summary?db=core;g=ENSG00000001036" + } + }, + { + "ensembl_gene_id": "ENSG00000001084", + "name": "glutamate-cysteine l...", + "summary": "Summary 8", + "symbol": "GCLC", + "alias": [ + "GLCL", + "GCL", + "GLCLC", + "GCS" + ], + "is_igap": true, + "is_eqtl": false, + "is_any_rna_changed_in_ad_brain": false, + "rna_brain_change_studied": false, + "is_any_protein_changed_in_ad_brain": true, + "protein_brain_change_studied": true, + "target_nominations": null, + "median_expression": null, + "druggability": [ + { + "sm_druggability_bucket": 1, + "safety_bucket": 5, + "abability_bucket": 3, + "pharos_class": null, + "classification": "Classification 5", + "safety_bucket_definition": "Safety definition 5", + "abability_bucket_definition": "Abability definition 5" + } + ], + "total_nominations": null, + "biodomains": null, + "is_adi": false, + "is_tep": true, + "resource_url": 
"https://adknowledgeportal.synapse.org/Explore/Target%20Enabling%20Resources?QueryWrapper0=%7B%22sql%22%3A%22select%20*%20from%20syn26146692%20WHERE%20%60isPublic%60%20%3D%20true%22%2C%22limit%22%3A25%2C%22offset%22%3A0%2C%22selectedFacets%22%3A%5B%7B%22concreteType%22%3A%22org.sagebionetworks.repo.model.table.FacetColumnValuesRequest%22%2C%22columnName%22%3A%22target%22%2C%22facetValues%22%3A%5B%22GCLC%22%5D%7D%5D%7D", + "ensembl_info": { + "ensembl_release": "111", + "ensembl_possible_replacements": [], + "ensembl_permalink": null + } + }, + { + "ensembl_gene_id": "ENSG00000001167", + "name": "nuclear transcriptio...", + "summary": "Summary 9", + "symbol": "NFYA", + "alias": [ + "CBF-B", + "NF-YA", + "HAP2", + "CBF-A" + ], + "is_igap": false, + "is_eqtl": false, + "is_any_rna_changed_in_ad_brain": false, + "rna_brain_change_studied": false, + "is_any_protein_changed_in_ad_brain": false, + "protein_brain_change_studied": false, + "target_nominations": null, + "median_expression": null, + "druggability": null, + "total_nominations": null, + "biodomains": null, + "is_adi": false, + "is_tep": false, + "resource_url": null, + "ensembl_info": { + "ensembl_release": "111", + "ensembl_possible_replacements": [], + "ensembl_permalink": "https://jan2024.archive.ensembl.org/Homo_sapiens/Gene/Summary?db=core;g=ENSG00000001167" + } + }, + { + "ensembl_gene_id": "ENSG00000001460", + "name": "sperm tail PG-rich r...", + "summary": "Summary 10", + "symbol": "STPG1", + "alias": [ + "C1orf201", + "MAPO2" + ], + "is_igap": true, + "is_eqtl": true, + "is_any_rna_changed_in_ad_brain": true, + "rna_brain_change_studied": true, + "is_any_protein_changed_in_ad_brain": false, + "protein_brain_change_studied": false, + "target_nominations": null, + "median_expression": null, + "druggability": null, + "total_nominations": null, + "biodomains": null, + "is_adi": false, + "is_tep": false, + "resource_url": null, + "ensembl_info": { + "ensembl_release": "111", + 
"ensembl_possible_replacements": [], + "ensembl_permalink": "https://jan2024.archive.ensembl.org/Homo_sapiens/Gene/Summary?db=core;g=ENSG00000001460" + } + }, + { + "ensembl_gene_id": "ENSG00000001461", + "name": "NIPA like domain con...", + "summary": "Summary 11", + "symbol": null, + "alias": [ + "DJ462O23.2", + "SLC57A5", + "NPAL3" + ], + "is_igap": false, + "is_eqtl": false, + "is_any_rna_changed_in_ad_brain": false, + "rna_brain_change_studied": false, + "is_any_protein_changed_in_ad_brain": false, + "protein_brain_change_studied": false, + "target_nominations": null, + "median_expression": null, + "druggability": null, + "total_nominations": null, + "biodomains": null, + "is_adi": false, + "is_tep": false, + "resource_url": null, + "ensembl_info": { + "ensembl_release": "111", + "ensembl_possible_replacements": [], + "ensembl_permalink": "https://jan2024.archive.ensembl.org/Homo_sapiens/Gene/Summary?db=core;g=ENSG00000001461" + } + }, + { + "ensembl_gene_id": "ENSG00000001497", + "name": "LAS1 like ribosome b...", + "summary": "Summary 12", + "symbol": "LAS1L", + "alias": [ + "Las1-like", + "Las1", + "dJ475B7.2", + "MRXSWTS", + "WTS" + ], + "is_igap": false, + "is_eqtl": false, + "is_any_rna_changed_in_ad_brain": false, + "rna_brain_change_studied": false, + "is_any_protein_changed_in_ad_brain": false, + "protein_brain_change_studied": false, + "target_nominations": null, + "median_expression": null, + "druggability": null, + "total_nominations": null, + "biodomains": null, + "is_adi": true, + "is_tep": false, + "resource_url": "https://adknowledgeportal.synapse.org/Explore/Target%20Enabling%20Resources?QueryWrapper0=%7B%22sql%22%3A%22select%20*%20from%20syn26146692%20WHERE%20%60isPublic%60%20%3D%20true%22%2C%22limit%22%3A25%2C%22offset%22%3A0%2C%22selectedFacets%22%3A%5B%7B%22concreteType%22%3A%22org.sagebionetworks.repo.model.table.FacetColumnValuesRequest%22%2C%22columnName%22%3A%22target%22%2C%22facetValues%22%3A%5B%22LAS1L%22%5D%7D%5D%7D", + 
"ensembl_info": { + "ensembl_release": "111", + "ensembl_possible_replacements": [], + "ensembl_permalink": "https://jan2024.archive.ensembl.org/Homo_sapiens/Gene/Summary?db=core;g=ENSG00000001497" + } + }, + { + "ensembl_gene_id": "ENSG00000001561", + "name": "ectonucleotide pyrop...", + "summary": "Summary 13", + "symbol": "ENPP4", + "alias": [ + "NPP4" + ], + "is_igap": false, + "is_eqtl": false, + "is_any_rna_changed_in_ad_brain": false, + "rna_brain_change_studied": false, + "is_any_protein_changed_in_ad_brain": false, + "protein_brain_change_studied": false, + "target_nominations": null, + "median_expression": null, + "druggability": null, + "total_nominations": null, + "biodomains": null, + "is_adi": false, + "is_tep": false, + "resource_url": null, + "ensembl_info": { + "ensembl_release": "111", + "ensembl_possible_replacements": [], + "ensembl_permalink": "https://jan2024.archive.ensembl.org/Homo_sapiens/Gene/Summary?db=core;g=ENSG00000001561" + } + }, + { + "ensembl_gene_id": "ENSG00000001617", + "name": "semaphorin 3F...", + "summary": "Summary 14", + "symbol": "SEMA3F", + "alias": [ + "SEMA4", + "SEMA-IV", + "SEMAK" + ], + "is_igap": false, + "is_eqtl": false, + "is_any_rna_changed_in_ad_brain": false, + "rna_brain_change_studied": false, + "is_any_protein_changed_in_ad_brain": false, + "protein_brain_change_studied": false, + "target_nominations": null, + "median_expression": null, + "druggability": null, + "total_nominations": null, + "biodomains": null, + "is_adi": false, + "is_tep": false, + "resource_url": null, + "ensembl_info": { + "ensembl_release": "111", + "ensembl_possible_replacements": [], + "ensembl_permalink": "https://jan2024.archive.ensembl.org/Homo_sapiens/Gene/Summary?db=core;g=ENSG00000001617" + } + }, + { + "ensembl_gene_id": "ENSG00000001626", + "name": "CF transmembrane con...", + "summary": "Summary 15", + "symbol": "CFTR", + "alias": [ + "ABCC7", + "MRP7", + "CFTR/MRP", + "dJ760C5.1", + "TNR-CFTR", + "ABC35", + "CF" + ], + 
"is_igap": false, + "is_eqtl": true, + "is_any_rna_changed_in_ad_brain": true, + "rna_brain_change_studied": true, + "is_any_protein_changed_in_ad_brain": false, + "protein_brain_change_studied": false, + "target_nominations": null, + "median_expression": null, + "druggability": null, + "total_nominations": null, + "biodomains": null, + "is_adi": false, + "is_tep": false, + "resource_url": null, + "ensembl_info": { + "ensembl_release": "111", + "ensembl_possible_replacements": [], + "ensembl_permalink": "https://jan2024.archive.ensembl.org/Homo_sapiens/Gene/Summary?db=core;g=ENSG00000001626" + } + }, + { + "ensembl_gene_id": "ENSG00000001629", + "name": "ankyrin repeat and I...", + "summary": "Summary 16", + "symbol": "ANKIB1", + "alias": [], + "is_igap": false, + "is_eqtl": false, + "is_any_rna_changed_in_ad_brain": true, + "rna_brain_change_studied": true, + "is_any_protein_changed_in_ad_brain": true, + "protein_brain_change_studied": true, + "target_nominations": null, + "median_expression": null, + "druggability": null, + "total_nominations": null, + "biodomains": null, + "is_adi": false, + "is_tep": false, + "resource_url": null, + "ensembl_info": { + "ensembl_release": "111", + "ensembl_possible_replacements": [], + "ensembl_permalink": "https://jan2024.archive.ensembl.org/Homo_sapiens/Gene/Summary?db=core;g=ENSG00000001629" + } + }, + { + "ensembl_gene_id": "ENSG00000001630", + "name": "cytochrome P450 fami...", + "summary": "Summary 17", + "symbol": "CYP51A1", + "alias": [ + "P450L1", + "P450-14DM", + "LDM", + "CP51", + "CYPL1", + "CYP51" + ], + "is_igap": false, + "is_eqtl": false, + "is_any_rna_changed_in_ad_brain": false, + "rna_brain_change_studied": false, + "is_any_protein_changed_in_ad_brain": true, + "protein_brain_change_studied": true, + "target_nominations": null, + "median_expression": null, + "druggability": null, + "total_nominations": null, + "biodomains": null, + "is_adi": false, + "is_tep": false, + "resource_url": null, + "ensembl_info": { 
+ "ensembl_release": "111", + "ensembl_possible_replacements": [], + "ensembl_permalink": "https://jan2024.archive.ensembl.org/Homo_sapiens/Gene/Summary?db=core;g=ENSG00000001630" + } + }, + { + "ensembl_gene_id": "ENSG00000001631", + "name": "KRIT1 ankyrin repeat...", + "summary": "Summary 18", + "symbol": "KRIT1", + "alias": [ + "CAM", + "CCM1" + ], + "is_igap": false, + "is_eqtl": false, + "is_any_rna_changed_in_ad_brain": true, + "rna_brain_change_studied": true, + "is_any_protein_changed_in_ad_brain": true, + "protein_brain_change_studied": true, + "target_nominations": null, + "median_expression": [ + { + "min": 2.39, + "first_quartile": 3.43, + "median": 3.58, + "mean": 3.56, + "third_quartile": 3.71, + "max": 4.0, + "tissue": "TCX" + }, + { + "min": 3.07, + "first_quartile": 3.75, + "median": 3.88, + "mean": 3.88, + "third_quartile": 4.01, + "max": 4.42, + "tissue": "IFG" + } + ], + "druggability": null, + "total_nominations": null, + "biodomains": null, + "is_adi": false, + "is_tep": false, + "resource_url": null, + "ensembl_info": { + "ensembl_release": "111", + "ensembl_possible_replacements": [], + "ensembl_permalink": "https://jan2024.archive.ensembl.org/Homo_sapiens/Gene/Summary?db=core;g=ENSG00000001631" + } + }, + { + "ensembl_gene_id": "ENSG00000161149", + "name": null, + "summary": null, + "symbol": null, + "alias": [], + "is_igap": false, + "is_eqtl": false, + "is_any_rna_changed_in_ad_brain": false, + "rna_brain_change_studied": false, + "is_any_protein_changed_in_ad_brain": false, + "protein_brain_change_studied": false, + "target_nominations": null, + "median_expression": null, + "druggability": null, + "total_nominations": null, + "biodomains": null, + "is_adi": false, + "is_tep": false, + "resource_url": null, + "ensembl_info": { + "ensembl_release": "107", + "ensembl_possible_replacements": [ + "ENSG00000284130" + ], + "ensembl_permalink": "https://jul2022.archive.ensembl.org/Homo_sapiens/Gene/Summary?db=core;g=ENSG00000161149" + } + }, + { 
+ "ensembl_gene_id": "ENSG00000183791", + "name": null, + "summary": null, + "symbol": null, + "alias": [], + "is_igap": false, + "is_eqtl": false, + "is_any_rna_changed_in_ad_brain": false, + "rna_brain_change_studied": false, + "is_any_protein_changed_in_ad_brain": false, + "protein_brain_change_studied": false, + "target_nominations": null, + "median_expression": null, + "druggability": null, + "total_nominations": null, + "biodomains": null, + "is_adi": false, + "is_tep": false, + "resource_url": null, + "ensembl_info": { + "ensembl_release": "100", + "ensembl_possible_replacements": [ + "ENSG00000288631", + "ENSG00000288616", + "ENSG00000288607" + ], + "ensembl_permalink": "https://apr2020.archive.ensembl.org/Homo_sapiens/Gene/Summary?db=core;g=ENSG00000183791" + } + }, + { + "ensembl_gene_id": "ENSG00000001517", + "name": null, + "summary": null, + "symbol": null, + "alias": [], + "is_igap": false, + "is_eqtl": false, + "is_any_rna_changed_in_ad_brain": false, + "rna_brain_change_studied": false, + "is_any_protein_changed_in_ad_brain": false, + "protein_brain_change_studied": false, + "target_nominations": null, + "median_expression": null, + "druggability": null, + "total_nominations": null, + "biodomains": null, + "is_adi": false, + "is_tep": false, + "resource_url": null, + "ensembl_info": { + "ensembl_release": null, + "ensembl_possible_replacements": [], + "ensembl_permalink": null + } + } +] \ No newline at end of file diff --git a/tests/transform/test_gene_info.py b/tests/transform/test_gene_info.py new file mode 100644 index 00000000..c4449eb9 --- /dev/null +++ b/tests/transform/test_gene_info.py @@ -0,0 +1,303 @@ +"""Integration test for the gene_info transform. + +This transform requires 12 different input datasets and tests several conditions in each one. Description and passing +or failing input for each dataset: + gene_metadata: collection of information like gene symbol, aliases, etc about each Ensembl ID in every dataset. 
+ Both the "alias" and "ensembl_possible_replacements" fields are lists of strings, so this dataset + needs to be saved in feather format and cannot be written as a csv file. + passing input: any field can have missing values, so test file has rows with missing values covering each + column, including ensembl_gene_id. There should be no duplicate Ensembl IDs. + failing input: duplicate Ensembl IDs should throw a MergeError + igap: list of genes that are associated with AD as determined by GWAS + passing input: either field can have missing values, so test file has a row with a missing hgnc_symbol. Ensembl + IDs should be unique. + failing input: duplicate Ensembl IDs should throw a MergeError + eqtl: list of genes that are associated with AD as determined by eQTL meta-analysis + passing input: either field can have missing values, so test file has a row with a missing has_eqtl value. + Ensembl IDs should be unique. + failing input: duplicate Ensembl IDs should throw a MergeError + proteomics: differential expression for proteomics LFQ data + passing input: any field can have missing values. Rows that are missing at least one of "log2_fc", "ci_upr", + "ci_lwr", or "cor_pval" are dropped, so there are several rows missing at least one of these + values in the test file. "cor_pval" must be numeric. The full LFQ dataset has data for multiple + tissues, so the test file contains several rows with the same Ensembl ID but different tissue + values to replicate that. + failing input: a string value in the "cor_pval" column should throw a TypeError. + diff_exp_data: differential expression for RNA-seq data + passing input: any field can have missing values, however for this dataset we assume that validation has been + done on the dataset before ingest and that "logfc", "ci_l", "ci_r", and "adj_p_val" have no + missing values, so we do not test missing values in these columns. Missing values are not handled + or dropped in the transform. "adj_p_val" must be numeric. 
Duplicate Ensembl IDs are allowed due + to multiple studies, so the test file has rows with the same Ensembl ID but different studies. + failing input: a string value in the "adj_p_val" column should throw a TypeError. + proteomics_tmt: differential expression for proteomics TMT data + passing input: this dataset has identical format to "proteomics", so the same passing rules apply. The full + TMT dataset only has data for one tissue, so all rows have the same tissue value in the test + file. Duplicate Ensembl IDs are technically allowed, but this doesn't happen (or make sense) in + the full data set so we do not test for it. + failing input: a string value in the "cor_pval" column should throw a TypeError. + proteomics_srm: differential expression for proteomics SRM data + passing input: this dataset has identical format to "proteomics", so the same passing rules apply. The full + SRM dataset only has data for one tissue, so all rows have the same tissue value in the test + file. Duplicate Ensembl IDs are technically allowed, but this doesn't happen (or make sense) in + the full data set so we do not test for it. + failing input: a string value in the "cor_pval" column should throw a TypeError. + target_list: a list of nomination information like source, justification, etc for Ensembl IDs that are nominated as + potential drug targets. + passing input: any field can be missing, and duplicate Ensembl IDs are allowed, so the test file contains rows + with the same Ensembl ID but different sources, and also has rows with missing values in at least + one column. + failing input: none + median_expression: distribution statistics (min, max, quartiles, etc) of RNA-seq data. + passing input: technically any field can be missing, but we assume that this data has been validated prior to + ingest and has no missing values in any numerical field. We only test the case where the + "tissue" value is missing. 
Duplicate Ensembl IDs are allowed due to multiple tissues, so the test + file has several rows with the same Ensembl ID but different tissue value. + failing input: none + druggability: information on the druggability and safety of each gene. + passing input: any field can be missing, so there are a few rows with missing data in at least one column. + Duplicate Ensembl IDs are technically allowed, but this does not happen (or make sense) in the + full dataset, so we do not test it. + failing input: none + genes_biodomains: a list of Ensembl IDs and their associated biodomains and GO terms. + passing input: any field can be missing, so the test file has rows with missing data in at least one column. + Rows with a missing "biodomain" are dropped in the transform. Duplicate Ensembl IDs are allowed + due to association with multiple biodomains and GO terms, so the test file has rows with the same + Ensembl ID but different biodomain and/or GO term values. + failing input: none + tep_adi_info: a list of Ensembl IDs and whether they are in the AD Informer set, the TEP set, both, or neither. + passing input: hgnc_symbol must not be missing, but is_adi and is_tep can have missing values. Missing is_adi + or is_tep values are assumed to mean "False". These two fields must have boolean values if the + data isn't missing. Ensembl IDs should be unique. + failing input: a missing hgnc_symbol or a string value in is_adi or is_tep should throw a TypeError. + +Other notes about the test files: + Missing Ensembl IDs: these are allowed in any dataset, and rows with missing IDs will get dropped in the transform. + Since all datasets are merged into "gene_metadata" the same way, it is not necessary to put a + missing Ensembl ID in every test file. Instead, only "gene_metadata" and "igap" have rows with + a missing Ensembl ID to test the merge and drop. + Overlap of Ensembl IDs: all test files contain only Ensembl IDs that exist in gene_metadata, except for "eqtl". 
class TestTransformGeneInfo:
    """Integration tests for gene_info.transform_gene_info.

    The transform needs 12 input datasets, so this class provides a utility method that loads them all into the
    dictionary of data frames that transform_gene_info expects. Passing cases compare the transform output against a
    stored JSON file; failing cases swap one "good" input file for a "bad" one and check the error that is raised.
    """

    # Directory holding the "input" and "output" test files. The path is relative, so it is resolved against the
    # directory pytest is run from.
    data_files_path = "tests/test_assets/gene_info"

    # Threshold arguments passed to transform_gene_info. Each passing case pairs one of these sets with its own
    # expected output file.
    param_set_1 = {
        "adjusted_p_value_threshold": 0.05,
        "protein_level_threshold": 0.05,
    }
    param_set_2 = {
        "adjusted_p_value_threshold": 1,
        "protein_level_threshold": 1,
    }

    # Default "good" input file for every dataset, keyed by the dataset name used by transform_gene_info
    core_files = {
        "gene_metadata": "gene_metadata_good_input.feather",
        "igap": "igap_good_input.csv",
        "eqtl": "eqtl_good_input.csv",
        "proteomics": "proteomics_good_input.csv",
        "diff_exp_data": "diff_exp_data_good_input.csv",
        "proteomics_tmt": "proteomics_tmt_good_input.csv",
        "proteomics_srm": "proteomics_srm_good_input.csv",
        "target_list": "target_list_good_input.csv",
        "median_expression": "median_expression_good_input.csv",
        "druggability": "druggability_good_input.csv",
        "genes_biodomains": "genes_biodomains_good_input.csv",
        "tep_adi_info": "tep_adi_info_good_input.csv",
    }

    # Substrings expected in the error messages of the failing cases
    pval_error_match_string = "'<=' not supported"
    merge_error_match_string = "Merge keys are not unique"

    # Each entry is (input files, expected output file, threshold parameters)
    pass_test_data = [
        (  # Pass with good data on param set 1
            core_files,
            "gene_info_good_output_1.json",
            param_set_1,
        ),
        (  # Pass with good data on param set 2
            core_files,
            "gene_info_good_output_2.json",
            param_set_2,
        ),
    ]
    # Must stay in the same order as pass_test_data
    pass_test_ids = [
        "Pass with good data on parameter set 1",
        "Pass with good data on parameter set 2",
    ]

    # Each entry is (default input files, files to swap in, threshold parameters, expected error type,
    # expected error message substring)
    fail_test_data = [
        (  # Duplicate Ensembl IDs in gene_metadata
            core_files,
            {"gene_metadata": "gene_metadata_merge_error.feather"},
            param_set_1,
            pd.errors.MergeError,
            merge_error_match_string,
        ),
        (  # Duplicate Ensembl IDs in igap
            core_files,
            {"igap": "igap_merge_error.csv"},
            param_set_1,
            pd.errors.MergeError,
            merge_error_match_string,
        ),
        (  # Duplicate Ensembl IDs in eqtl
            core_files,
            {"eqtl": "eqtl_merge_error.csv"},
            param_set_1,
            pd.errors.MergeError,
            merge_error_match_string,
        ),
        (  # Bad data type in diff_exp_data
            core_files,
            {"diff_exp_data": "diff_exp_data_type_error.csv"},
            param_set_1,
            TypeError,
            pval_error_match_string,
        ),
        (  # Bad data type in proteomics
            core_files,
            {"proteomics": "proteomics_type_error.csv"},
            param_set_1,
            TypeError,
            pval_error_match_string,
        ),
        (  # Bad data type in proteomics_tmt
            core_files,
            {"proteomics_tmt": "proteomics_tmt_type_error.csv"},
            param_set_1,
            TypeError,
            pval_error_match_string,
        ),
        (  # Bad data type in proteomics_srm
            core_files,
            {"proteomics_srm": "proteomics_srm_type_error.csv"},
            param_set_1,
            TypeError,
            pval_error_match_string,
        ),
        (  # Missing HGNC in tep_adi_info
            core_files,
            {"tep_adi_info": "tep_adi_info_type_error_1.csv"},
            param_set_1,
            TypeError,
            "can only concatenate str",
        ),
        (  # is_adi is a string
            core_files,
            {"tep_adi_info": "tep_adi_info_type_error_2.csv"},
            param_set_1,
            TypeError,
            "'is_adi' column must be 'bool'",
        ),
        (  # is_tep is a string
            core_files,
            {"tep_adi_info": "tep_adi_info_type_error_3.csv"},
            param_set_1,
            TypeError,
            "'is_tep' column must be 'bool'",
        ),
    ]
    # Must stay in the same order as fail_test_data
    fail_test_ids = [
        "Fail with duplicate Ensembl IDs in gene_metadata",
        "Fail with duplicate Ensembl IDs in igap",
        "Fail with duplicate Ensembl IDs in eqtl",
        "Fail with bad data type in diff_exp_data's adj_p_val column",
        "Fail with bad data type in proteomics's cor_pval column",
        "Fail with bad data type in proteomics_tmt's cor_pval column",
        "Fail with bad data type in proteomics_srm's cor_pval column",
        "Fail with missing hgnc_symbol in tep_adi_info",
        "Fail with bad data type in tep_adi_info's is_adi column",
        "Fail with bad data type in tep_adi_info's is_tep column",
    ]

    def read_input_files_dict(self, input_files_dict: dict) -> dict:
        """Utility function to read a dictionary of filenames into a dictionary of data frames. Most files for
        gene_info are in csv format, but the 'gene_metadata' file is in feather format and needs special casing.

        Args:
            input_files_dict: a dictionary where keys are the names of the datasets, as expected by
                              transform_gene_info, and values are the filenames to load

        Returns:
            datasets: a dictionary where the keys are the names of the datasets, as expected by
                      transform_gene_info, and the values are data frames
        """
        datasets = {}
        for key, value in input_files_dict.items():
            filename = os.path.join(self.data_files_path, "input", value)
            if key == "gene_metadata":
                datasets[key] = pd.read_feather(filename)
            else:
                datasets[key] = pd.read_csv(filename)

        return datasets

    @pytest.mark.parametrize(
        "input_files_dict, expected_output_file, param_set",
        pass_test_data,
        ids=pass_test_ids,
    )
    def test_transform_gene_info_should_pass(
        self, input_files_dict: dict, expected_output_file: str, param_set: dict
    ):
        """Run the transform on good input and compare the result to the stored expected output.

        Args:
            input_files_dict: dataset name -> input filename, as accepted by read_input_files_dict
            expected_output_file: name of the JSON file in the "output" folder holding the expected data frame
            param_set: dictionary with the "adjusted_p_value_threshold" and "protein_level_threshold" values
        """
        datasets = self.read_input_files_dict(input_files_dict)

        output_df = gene_info.transform_gene_info(
            datasets=datasets,
            adjusted_p_value_threshold=param_set["adjusted_p_value_threshold"],
            protein_level_threshold=param_set["protein_level_threshold"],
        )

        # Index needs to be reset because of dropping NA rows
        output_df = output_df.reset_index(drop=True)

        json_file = os.path.join(self.data_files_path, "output", expected_output_file)
        expected_df = pd.read_json(json_file)
        pd.testing.assert_frame_equal(output_df, expected_df)

    @pytest.mark.parametrize(
        "input_files_dict, failure_case_files_dict, param_set, error_type, error_match_string",
        fail_test_data,
        ids=fail_test_ids,
    )
    def test_transform_gene_info_should_fail(
        self,
        input_files_dict: dict,
        failure_case_files_dict: dict,
        param_set: dict,
        error_type: "type[BaseException]",
        error_match_string: str,
    ):
        """Run the transform with one or more bad input files and check that the expected error is raised.

        Args:
            input_files_dict: dataset name -> default "good" input filename
            failure_case_files_dict: dataset name -> "bad" input filename that replaces the default for that dataset
            param_set: dictionary with the "adjusted_p_value_threshold" and "protein_level_threshold" values
            error_type: the exception class the transform is expected to raise
            error_match_string: text that must match the raised error's message
        """
        # Build a new dictionary so the shared class-level dictionary is never edited. Any files specified in
        # 'failure_case_files_dict' replace their default "good" files from input_files_dict.
        updated_files_dict = {**input_files_dict, **failure_case_files_dict}

        # Load the data outside of the pytest.raises block so that an error raised while reading the test files
        # is reported as an error instead of being counted as the expected transform failure.
        datasets = self.read_input_files_dict(updated_files_dict)

        with pytest.raises(error_type, match=error_match_string):
            gene_info.transform_gene_info(
                datasets=datasets,
                adjusted_p_value_threshold=param_set["adjusted_p_value_threshold"],
                protein_level_threshold=param_set["protein_level_threshold"],
            )