From c589dda85aed68a87932340f52e6586d41d57907 Mon Sep 17 00:00:00 2001 From: Jaclyn Beck Date: Thu, 6 Jul 2023 14:22:36 -0700 Subject: [PATCH 1/6] Added some clarifying comments to the distribution_data transform --- .../etl/transform/distribution_data.py | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/src/agoradatatools/etl/transform/distribution_data.py b/src/agoradatatools/etl/transform/distribution_data.py index 17e7e3fb..7ce7c55a 100644 --- a/src/agoradatatools/etl/transform/distribution_data.py +++ b/src/agoradatatools/etl/transform/distribution_data.py @@ -4,7 +4,8 @@ def calculate_distribution(df: pd.DataFrame, col: str, is_scored, upper_bound) -> dict: if is_scored: - df = df[df[is_scored] == "Y"] # df does not have the isscored + df = df[df[is_scored] == "Y"] + # If isscored is blank/NaN, take all rows with at least one "Y" in any isscored column else: df = df[df.isin(["Y"]).any(axis=1)] @@ -26,12 +27,14 @@ def calculate_distribution(df: pd.DataFrame, col: str, is_scored, upper_bound) - distribution, bins=10, precision=3, include_lowest=True, right=True ).value_counts(sort=False) ) - obj["distribution"][ - 0 - ] -= 1 # since this was calculated with the artificial 0 value, we subtract it - obj["distribution"][ - -1 - ] -= 1 # since this was calculated with the artificial upper_bound, we subtract it + + # obj["distribution"][0] is for the lowest bin, which includes values of 0. Since this was + # calculated with an extra artificial 0 value, we subtract 1 to get the real count. + obj["distribution"][0] -= 1 + + # obj["distribution"][-1] (end of the list) is for the highest bin, which includes the upper + # bound. Since this was calculated with an extra artificial upper_bound, we subtract 1 as above. + obj["distribution"][-1] -= 1 discard, obj["bins"] = list( pd.cut(distribution, bins=10, precision=3, retbins=True) From 2abf571b369ada2af5c9dd5b7829fde937fbd68e Mon Sep 17 00:00:00 2001 From: Jaclyn Beck Date: Thu, 6 Jul 2023 16:01:57 -0700 Subject: [PATCH 2/6] Added data-driven test for distribution_data transform --- .../test_distribution_data_good_input.csv | 27 ++ .../test_distribution_data_missing_input.csv | 31 +++ ...ribution_data_wrong_data_type_genetics.csv | 5 + ...istribution_data_wrong_data_type_omics.csv | 5 + ...tribution_data_wrong_data_type_overall.csv | 5 + .../distribution_data_good_output_1.json | 260 ++++++++++++++++++ .../distribution_data_good_output_2.json | 260 ++++++++++++++++++ .../distribution_data_missing_output.json | 260 ++++++++++++++++++ tests/transform/test_distribution_data.py | 118 ++++++++ 9 files changed, 971 insertions(+) create mode 100644 tests/test_assets/distribution_data/input/test_distribution_data_good_input.csv create mode 100644 tests/test_assets/distribution_data/input/test_distribution_data_missing_input.csv create mode 100644 tests/test_assets/distribution_data/input/test_distribution_data_wrong_data_type_genetics.csv create mode 100644 tests/test_assets/distribution_data/input/test_distribution_data_wrong_data_type_omics.csv create mode 100644 tests/test_assets/distribution_data/input/test_distribution_data_wrong_data_type_overall.csv create mode 100644 tests/test_assets/distribution_data/output/distribution_data_good_output_1.json create mode 100644 tests/test_assets/distribution_data/output/distribution_data_good_output_2.json create mode 100644 tests/test_assets/distribution_data/output/distribution_data_missing_output.json create mode 100644 tests/transform/test_distribution_data.py diff --git a/tests/test_assets/distribution_data/input/test_distribution_data_good_input.csv b/tests/test_assets/distribution_data/input/test_distribution_data_good_input.csv new file mode 100644 index 00000000..2dc0119e --- /dev/null +++ b/tests/test_assets/distribution_data/input/test_distribution_data_good_input.csv @@ -0,0 +1,27 @@ +,hgnc_gene_id,ensg,overall,overall_rank,geneticsscore,omicsscore,literaturescore,neuropathscore,sm_druggability_bucket,safety_bucket,feasibility_bucket,abability_bucket,new_modality_bucket,tissue_engagement_bucket,pharos_class,classification,safety_bucket_definition,feasibility_bucket_definition,abability_bucket_definition,new_modality_bucket_definition,tissue_engagement_bucket_definition,isscored_genetics,isscored_omics,isscored_lit,isscored_neuropath +378977_429,HIVEP2,ENSG00000010818,3.185141534,3832,1.744062244,1.441079289,1.260049115,0.200555635,7,3,4,6,4,2,Tdark,Potentially targetable by protein family structure: is a member of a gene family which has a protein member with a druggable pocket in the protein structure.,"Two or fewer of: high off-target gene expression, cancer driver, essential gene, associated deleterious genetic disorder, HPO phenotype associated gene, or black box warning on clinically used drug.","Medium target qualification, no critical safety issues. Identify and perform steps needed for additional TQ and move to category 1, 2 or 3 if successful, or park in category 5 or 6 if not.",Protein located in intracellular compartment.,Not specified suitable for degradation/inhibition by user.,"The target gene is not """"tissue enriched""""/""""tissue enhanced"""" in any tissue.",Y,Y,Y,Y +378978_429,HIVEP2,ENSG00000010818,3.185141534,3832,1.744062244,1.441079289,1.260049115,0.1,7,3,4,6,4,2,Tdark,Potentially targetable by protein family structure: is a member of a gene family which has a protein member with a druggable pocket in the protein structure.,"Two or fewer of: high off-target gene expression, cancer driver, essential gene, associated deleterious genetic disorder, HPO phenotype associated gene, or black box warning on clinically used drug.","Medium target qualification, no critical safety issues. Identify and perform steps needed for additional TQ and move to category 1, 2 or 3 if successful, or park in category 5 or 6 if not.",Protein located in intracellular compartment.,Not specified suitable for degradation/inhibition by user.,"The target gene is not """"tissue enriched""""/""""tissue enhanced"""" in any tissue.",Y,Y,Y,Y +389156_442,ATP1B3,ENSG00000069849,1.63040118,14056,1.63040118,0,0,0,1,5,4,3,4,2,Tclin,"Small molecule druggable: Protein with a small molecule ligand identified from ChEMBL, meeting TCRD activity criteria.",Clinical data with evidence of intolerable safety profile/adverse drug reactions in the desired modality and with target engagement. Drug for target withdrawn on those grounds.,"Medium target qualification, no critical safety issues. Identify and perform steps needed for additional TQ and move to category 1, 2 or 3 if successful, or park in category 5 or 6 if not.","Cell membrane-bound proteins. Highly accessible to antibody-based therapies, but potentially less so than secreted proteins or ECM components.",Not specified suitable for degradation/inhibition by user.,"The target gene is not """"tissue enriched""""/""""tissue enhanced"""" in any tissue.",Y,Y,N,N +376410_426,SPR,ENSG00000116096,3.663454319,1251,1.695164935,1.968289384,1.55615776,1.128843106,3,4,3,5,4,2,Tbio,"Targetable by structure: Structurally druggable protein, based on the presence of a druggable pocket in the protein (DrugEBIlity/CanSAR).","More than two of: high off target gene expression, cancer driver, essential gene, associated deleterious genetic disorder, HPO phenotype associated gene, or black box warning on clinically used drug.","High target qualification but low feasibility – Tier C for drug development. Attempt to address gaps and re-evaluate if additional feasibility tools found. Either move to category 1 or 2 if successful, or park in category 5 if not.","Protein located in the cytosol. Not practically accessible to antibody-based therapies, but may be more easily accessible to other modalities.",Not specified suitable for degradation/inhibition by user.,"The target gene is not """"tissue enriched""""/""""tissue enhanced"""" in any tissue.",N,N,N,N +386108_438,,ENSG00000271743,2.016073569,10997,,,,,,,,,,,,,,,,,,N,N,N,N +386671_439,EVA1C,ENSG00000166979,1.942294708,11562,0.850712097,1.091582611,0.021501052,0,13,2,5,3,4,2,Tbio,Unknown: There is no information on ligands or structure in any of the categories above.,"No major issues found from gene expression, genetic or pharmacological profiling, but has not been extensively tested in humans.",Medium or high target qualification. Interesting target but has gaps in TQ or tractability/druggability which will make progress difficult or slow. Park and watch for developments e.g. until additional TQ or tools emerge.,"Cell membrane-bound proteins. Highly accessible to antibody-based therapies, but potentially less so than secreted proteins or ECM components.",Not specified suitable for degradation/inhibition by user.,"The target gene is not """"tissue enriched""""/""""tissue enhanced"""" in any tissue.",N,N,Y,Y +389129_442,HDAC11,ENSG00000163517,1.634292658,14029,1.634292658,0,1.260049115,0.068929196,1,1,4,3,4,2,Tclin,"Small molecule druggable: Protein with a small molecule ligand identified from ChEMBL, meeting TCRD activity criteria.","Clinical data, evidence of tolerable safety profile in desired modality; target has a drug in phase IV in the appropriate modality, with good safety profile.","Medium target qualification, no critical safety issues. Identify and perform steps needed for additional TQ and move to category 1, 2 or 3 if successful, or park in category 5 or 6 if not.","Cell membrane-bound proteins. Highly accessible to antibody-based therapies, but potentially less so than secreted proteins or ECM components.",Not specified suitable for degradation/inhibition by user.,"The target gene is not """"tissue enriched""""/""""tissue enhanced"""" in any tissue.",Y,Y,Y,Y +388638_441,,ENSG00000277020,1.694933378,13537,0.647214232,1.047719146,0,0,,,,,,,,,,,,,,Y,Y,N,N +385369_437,MED29,ENSG00000063322,2.121700201,10256,1.05077206,1.070928141,0,0,7,4,5,6,4,2,Tbio,Potentially targetable by protein family structure: is a member of a gene family which has a protein member with a druggable pocket in the protein structure.,"More than two of: high off target gene expression, cancer driver, essential gene, associated deleterious genetic disorder, HPO phenotype associated gene, or black box warning on clinically used drug.",Medium or high target qualification. Interesting target but has gaps in TQ or tractability/druggability which will make progress difficult or slow. Park and watch for developments e.g. until additional TQ or tools emerge.,Protein located in intracellular compartment.,Not specified suitable for degradation/inhibition by user.,"The target gene is not """"tissue enriched""""/""""tissue enhanced"""" in any tissue.",Y,Y,N,N +390905_444,NRG3-AS1,ENSG00000225738,1.411592739,15807,0.501247906,0.910344833,0,0,,,,,,,,,,,,,,Y,Y,N,N +384723_437,,ENSG00000285582,2.220850217,9610,0.686138318,1.534711899,0,0,,,,,,,,,,,,,,Y,Y,N,N +394347_447,,ENSG00000225017,1.013299554,19263,0.580539456,0.432760098,0,0,,,,,,,,,,,,,,Y,Y,N,N +388040_440,NLGN1-AS1,ENSG00000228213,1.765755814,12936,0.7067041,1.059051714,0,0,,,,,,,,,,,,,,Y,Y,N,N +376866_427,TPM1,ENSG00000140416,3.546033928,1711,1.725863894,1.820170033,0.760263593,0.257244846,12,4,3,3,4,2,Tbio,Potentially low ligandability: Has a structure but there is no evidence of a druggable pocket.,"More than two of: high off target gene expression, cancer driver, essential gene, associated deleterious genetic disorder, HPO phenotype associated gene, or black box warning on clinically used drug.","High target qualification but low feasibility – Tier C for drug development. Attempt to address gaps and re-evaluate if additional feasibility tools found. Either move to category 1 or 2 if successful, or park in category 5 if not.","Cell membrane-bound proteins. Highly accessible to antibody-based therapies, but potentially less so than secreted proteins or ECM components.",Not specified suitable for degradation/inhibition by user.,"The target gene is not """"tissue enriched""""/""""tissue enhanced"""" in any tissue.",Y,Y,Y,Y +383071_435,SAMD1,ENSG00000141858,2.49570642,7952,0.984364579,1.511341841,0,0.1100007,7,3,6,4,4,2,Tdark,Potentially targetable by protein family structure: is a member of a gene family which has a protein member with a druggable pocket in the protein structure.,"Two or fewer of: high off-target gene expression, cancer driver, essential gene, associated deleterious genetic disorder, HPO phenotype associated gene, or black box warning on clinically used drug.",Low target qualification. Does not adequately support target to progress. No clear path forward. Park and watch literature for developments.,"Limited evidence that target is a secreted protein, ECM component or cell membrane-bound protein. ",Not specified suitable for degradation/inhibition by user.,"The target gene is not """"tissue enriched""""/""""tissue enhanced"""" in any tissue.",Y,Y,N,Y +376224_426,NSUN2,ENSG00000037474,3.719834306,1063,2.727519399,0.992314906,1.260049115,0,7,4,4,6,4,2,Tbio,Potentially targetable by protein family structure: is a member of a gene family which has a protein member with a druggable pocket in the protein structure.,"More than two of: high off target gene expression, cancer driver, essential gene, associated deleterious genetic disorder, HPO phenotype associated gene, or black box warning on clinically used drug.","Medium target qualification, no critical safety issues. Identify and perform steps needed for additional TQ and move to category 1, 2 or 3 if successful, or park in category 5 or 6 if not.",Protein located in intracellular compartment.,Not specified suitable for degradation/inhibition by user.,"The target gene is not """"tissue enriched""""/""""tissue enhanced"""" in any tissue.",Y,Y,Y,Y +394277_447,MAB21L1,ENSG00000180660,1.026548266,19193,1.026548266,0,0,0,3,4,3,6,4,4,Tbio,"Targetable by structure: Structurally druggable protein, based on the presence of a druggable pocket in the protein (DrugEBIlity/CanSAR).","More than two of: high off target gene expression, cancer driver, essential gene, associated deleterious genetic disorder, HPO phenotype associated gene, or black box warning on clinically used drug.","High target qualification but low feasibility – Tier C for drug development. Attempt to address gaps and re-evaluate if additional feasibility tools found. Either move to category 1 or 2 if successful, or park in category 5 if not.",Protein located in intracellular compartment.,Not specified suitable for degradation/inhibition by user.,The target gene is enriched/enhanced in tissues marked by the user as being associated with a low risk of off-target engagement issues.,Y,Y,Y,Y +398381_451,COQ10A,ENSG00000135469,0.610112255,23299,0.610112255,0,0,0,13,2,5,4,4,2,Tdark,Unknown: There is no information on ligands or structure in any of the categories above.,"No major issues found from gene expression, genetic or pharmacological profiling, but has not been extensively tested in humans.",Medium or high target qualification. Interesting target but has gaps in TQ or tractability/druggability which will make progress difficult or slow. Park and watch for developments e.g. until additional TQ or tools emerge.,"Limited evidence that target is a secreted protein, ECM component or cell membrane-bound protein. ",Not specified suitable for degradation/inhibition by user.,"The target gene is not """"tissue enriched""""/""""tissue enhanced"""" in any tissue.",Y,Y,Y,Y +387383_440,,ENSG00000287306,1.847974229,12278,0.760978821,1.086995408,0,0,,,,,,,,,,,,,,Y,Y,N,N +381435_433,POGZ,ENSG00000143442,2.791210607,6303,2.019822476,0.771388131,0,0,7,4,4,3,4,2,Tbio,Potentially targetable by protein family structure: is a member of a gene family which has a protein member with a druggable pocket in the protein structure.,"More than two of: high off target gene expression, cancer driver, essential gene, associated deleterious genetic disorder, HPO phenotype associated gene, or black box warning on clinically used drug.","Medium target qualification, no critical safety issues. Identify and perform steps needed for additional TQ and move to category 1, 2 or 3 if successful, or park in category 5 or 6 if not.","Cell membrane-bound proteins. Highly accessible to antibody-based therapies, but potentially less so than secreted proteins or ECM components.",Not specified suitable for degradation/inhibition by user.,"The target gene is not """"tissue enriched""""/""""tissue enhanced"""" in any tissue.",Y,Y,N,N +378852_429,XPO1,ENSG00000082898,3.203269035,3707,1.810349845,1.39291919,0.760263593,0,3,2,3,6,4,2,Tbio,"Targetable by structure: Structurally druggable protein, based on the presence of a druggable pocket in the protein (DrugEBIlity/CanSAR).","No major issues found from gene expression, genetic or pharmacological profiling, but has not been extensively tested in humans.","High target qualification but low feasibility – Tier C for drug development. Attempt to address gaps and re-evaluate if additional feasibility tools found. Either move to category 1 or 2 if successful, or park in category 5 if not.",Protein located in intracellular compartment.,Not specified suitable for degradation/inhibition by user.,"The target gene is not """"tissue enriched""""/""""tissue enhanced"""" in any tissue.",Y,Y,Y,Y +377326_427,DLST,ENSG00000119689,3.455978293,2170,1.69390856,1.762069732,0.944644391,0,11,4,5,3,4,2,Tbio,"Druggable protein class, no other information: Is a member of a PHAROS druggable class of protein (enzyme, receptor, ion channel, nuclear hormone receptor, kinase) but does not meet any of the criteria above.","More than two of: high off target gene expression, cancer driver, essential gene, associated deleterious genetic disorder, HPO phenotype associated gene, or black box warning on clinically used drug.",Medium or high target qualification. Interesting target but has gaps in TQ or tractability/druggability which will make progress difficult or slow. Park and watch for developments e.g. until additional TQ or tools emerge.,"Cell membrane-bound proteins. Highly accessible to antibody-based therapies, but potentially less so than secreted proteins or ECM components.",Not specified suitable for degradation/inhibition by user.,"The target gene is not """"tissue enriched""""/""""tissue enhanced"""" in any tissue.",Y,Y,Y,Y +394150_447,CPB1,ENSG00000153002,1.044579816,19066,1.044579816,0,0,0,1,4,3,1,4,4,Tchem,"Small molecule druggable: Protein with a small molecule ligand identified from ChEMBL, meeting TCRD activity criteria.","More than two of: high off target gene expression, cancer driver, essential gene, associated deleterious genetic disorder, HPO phenotype associated gene, or black box warning on clinically used drug.","High target qualification but low feasibility – Tier C for drug development. Attempt to address gaps and re-evaluate if additional feasibility tools found. Either move to category 1 or 2 if successful, or park in category 5 if not.",Secreted protein. Highly accessible to antibody-based therapies.,Not specified suitable for degradation/inhibition by user.,The target gene is enriched/enhanced in tissues marked by the user as being associated with a low risk of off-target engagement issues.,Y,N,N,N +395167_448,RALGAPA1P1,ENSG00000229419,0.904383679,20084,0.670119353,0.234264326,0,0,,,,,,,,,,,,,,Y,Y,N,N +375407_425,IGF1R,ENSG00000140443,4.0940858,241,2.117549261,1.97653654,1.529966061,0,1,2,2,3,4,2,Tclin,"Small molecule druggable: Protein with a small molecule ligand identified from ChEMBL, meeting TCRD activity criteria.","No major issues found from gene expression, genetic or pharmacological profiling, but has not been extensively tested in humans.",High target qualification but medium feasibility (some gaps) – Tier B for drug development. Proceed while working to address gaps.,"Cell membrane-bound proteins. Highly accessible to antibody-based therapies, but potentially less so than secreted proteins or ECM components.",Not specified suitable for degradation/inhibition by user.,"The target gene is not """"tissue enriched""""/""""tissue enhanced"""" in any tissue.",Y,Y,Y,Y +376472_426,IFITM2,ENSG00000185201,3.641792765,1314,2.129965595,1.51182717,1.391682716,0.357323438,13,4,4,3,4,2,Tbio,Unknown: There is no information on ligands or structure in any of the categories above.,"More than two of: high off target gene expression, cancer driver, essential gene, associated deleterious genetic disorder, HPO phenotype associated gene, or black box warning on clinically used drug.","Medium target qualification, no critical safety issues. Identify and perform steps needed for additional TQ and move to category 1, 2 or 3 if successful, or park in category 5 or 6 if not.","Cell membrane-bound proteins. Highly accessible to antibody-based therapies, but potentially less so than secreted proteins or ECM components.",Not specified suitable for degradation/inhibition by user.,"The target gene is not """"tissue enriched""""/""""tissue enhanced"""" in any tissue.",Y,Y,Y,Y \ No newline at end of file diff --git a/tests/test_assets/distribution_data/input/test_distribution_data_missing_input.csv b/tests/test_assets/distribution_data/input/test_distribution_data_missing_input.csv new file mode 100644 index 00000000..9af538c7 --- /dev/null +++ b/tests/test_assets/distribution_data/input/test_distribution_data_missing_input.csv @@ -0,0 +1,31 @@ +,hgnc_gene_id,ensg,overall,overall_rank,geneticsscore,omicsscore,literaturescore,neuropathscore,sm_druggability_bucket,safety_bucket,feasibility_bucket,abability_bucket,new_modality_bucket,tissue_engagement_bucket,pharos_class,classification,safety_bucket_definition,feasibility_bucket_definition,abability_bucket_definition,new_modality_bucket_definition,tissue_engagement_bucket_definition,isscored_genetics,isscored_omics,isscored_lit,isscored_neuropath +378977_429,HIVEP2,ENSG00000010818,3.185141534,3832,1.744062244,1.441079289,1.260049115,0.200555635,7,3,4,6,4,2,Tdark,Potentially targetable by protein family structure: is a member of a gene family which has a protein member with a druggable pocket in the protein structure.,"Two or fewer of: high off-target gene expression, cancer driver, essential gene, associated deleterious genetic disorder, HPO phenotype associated gene, or black box warning on clinically used drug.","Medium target qualification, no critical safety issues. Identify and perform steps needed for additional TQ and move to category 1, 2 or 3 if successful, or park in category 5 or 6 if not.",Protein located in intracellular compartment.,Not specified suitable for degradation/inhibition by user.,"The target gene is not """"tissue enriched""""/""""tissue enhanced"""" in any tissue.",,Y,Y,Y +378978_429,HIVEP2,ENSG00000010818,3.185141534,3832,1.744062244,1.441079289,1.260049115,0.1,7,3,4,6,4,2,Tdark,Potentially targetable by protein family structure: is a member of a gene family which has a protein member with a druggable pocket in the protein structure.,"Two or fewer of: high off-target gene expression, cancer driver, essential gene, associated deleterious genetic disorder, HPO phenotype associated gene, or black box warning on clinically used drug.","Medium target qualification, no critical safety issues. Identify and perform steps needed for additional TQ and move to category 1, 2 or 3 if successful, or park in category 5 or 6 if not.",Protein located in intracellular compartment.,Not specified suitable for degradation/inhibition by user.,"The target gene is not """"tissue enriched""""/""""tissue enhanced"""" in any tissue.",Y,,Y,Y +389156_442,ATP1B3,ENSG00000069849,1.63040118,14056,1.63040118,0,0,0,1,5,4,3,4,2,Tclin,"Small molecule druggable: Protein with a small molecule ligand identified from ChEMBL, meeting TCRD activity criteria.",Clinical data with evidence of intolerable safety profile/adverse drug reactions in the desired modality and with target engagement. Drug for target withdrawn on those grounds.,"Medium target qualification, no critical safety issues. Identify and perform steps needed for additional TQ and move to category 1, 2 or 3 if successful, or park in category 5 or 6 if not.","Cell membrane-bound proteins. Highly accessible to antibody-based therapies, but potentially less so than secreted proteins or ECM components.",Not specified suitable for degradation/inhibition by user.,"The target gene is not """"tissue enriched""""/""""tissue enhanced"""" in any tissue.",Y,Y,,N +376410_426,SPR,ENSG00000116096,3.663454319,1251,1.695164935,1.968289384,1.55615776,1.128843106,3,4,3,5,4,2,Tbio,"Targetable by structure: Structurally druggable protein, based on the presence of a druggable pocket in the protein (DrugEBIlity/CanSAR).","More than two of: high off target gene expression, cancer driver, essential gene, associated deleterious genetic disorder, HPO phenotype associated gene, or black box warning on clinically used drug.","High target qualification but low feasibility – Tier C for drug development. Attempt to address gaps and re-evaluate if additional feasibility tools found. Either move to category 1 or 2 if successful, or park in category 5 if not.","Protein located in the cytosol. Not practically accessible to antibody-based therapies, but may be more easily accessible to other modalities.",Not specified suitable for degradation/inhibition by user.,"The target gene is not """"tissue enriched""""/""""tissue enhanced"""" in any tissue.",N,N,N, +386108_438,,ENSG00000271743,2.016073569,10997,,,,,,,,,,,,,,,,,,N,N,N,N +386671_439,EVA1C,ENSG00000166979,1.942294708,11562,0.850712097,1.091582611,0.021501052,0,13,2,5,3,4,2,Tbio,Unknown: There is no information on ligands or structure in any of the categories above.,"No major issues found from gene expression, genetic or pharmacological profiling, but has not been extensively tested in humans.",Medium or high target qualification. Interesting target but has gaps in TQ or tractability/druggability which will make progress difficult or slow. Park and watch for developments e.g. until additional TQ or tools emerge.,"Cell membrane-bound proteins. Highly accessible to antibody-based therapies, but potentially less so than secreted proteins or ECM components.",Not specified suitable for degradation/inhibition by user.,"The target gene is not """"tissue enriched""""/""""tissue enhanced"""" in any tissue.",N,N,Y,Y +378979_429,HIVEP2,ENSG00000010818,3.185141534,3832,,1.441079289,1.260049115,0.1,7,3,4,6,4,2,Tdark,Potentially targetable by protein family structure: is a member of a gene family which has a protein member with a druggable pocket in the protein structure.,"Two or fewer of: high off-target gene expression, cancer driver, essential gene, associated deleterious genetic disorder, HPO phenotype associated gene, or black box warning on clinically used drug.","Medium target qualification, no critical safety issues. Identify and perform steps needed for additional TQ and move to category 1, 2 or 3 if successful, or park in category 5 or 6 if not.",Protein located in intracellular compartment.,Not specified suitable for degradation/inhibition by user.,"The target gene is not """"tissue enriched""""/""""tissue enhanced"""" in any tissue.",Y,Y,Y,Y +386672_439,EVA1C,ENSG00000166979,1.942294708,11562,0.850712097,,0.021501052,0,13,2,5,3,4,2,Tbio,Unknown: There is no information on ligands or structure in any of the categories above.,"No major issues found from gene expression, genetic or pharmacological profiling, but has not been extensively tested in humans.",Medium or high target qualification. Interesting target but has gaps in TQ or tractability/druggability which will make progress difficult or slow. Park and watch for developments e.g. until additional TQ or tools emerge.,"Cell membrane-bound proteins. Highly accessible to antibody-based therapies, but potentially less so than secreted proteins or ECM components.",Not specified suitable for degradation/inhibition by user.,"The target gene is not """"tissue enriched""""/""""tissue enhanced"""" in any tissue.",Y,Y,Y,Y +386673_439,EVA1C,ENSG00000166979,1.942294708,11562,0.850712097,1.091582611,,0,13,2,5,3,4,2,Tbio,Unknown: There is no information on ligands or structure in any of the categories above.,"No major issues found from gene expression, genetic or pharmacological profiling, but has not been extensively tested in humans.",Medium or high target qualification. Interesting target but has gaps in TQ or tractability/druggability which will make progress difficult or slow. Park and watch for developments e.g. until additional TQ or tools emerge.,"Cell membrane-bound proteins. Highly accessible to antibody-based therapies, but potentially less so than secreted proteins or ECM components.",Not specified suitable for degradation/inhibition by user.,"The target gene is not """"tissue enriched""""/""""tissue enhanced"""" in any tissue.",Y,Y,Y,Y +376411_426,SPR,ENSG00000116096,3.663454319,1251,1.695164935,1.968289384,1.55615776,,3,4,3,5,4,2,Tbio,"Targetable by structure: Structurally druggable protein, based on the presence of a druggable pocket in the protein (DrugEBIlity/CanSAR).","More than two of: high off target gene expression, cancer driver, essential gene, associated deleterious genetic disorder, HPO phenotype associated gene, or black box warning on clinically used drug.","High target qualification but low feasibility – Tier C for drug development. Attempt to address gaps and re-evaluate if additional feasibility tools found. Either move to category 1 or 2 if successful, or park in category 5 if not.","Protein located in the cytosol. Not practically accessible to antibody-based therapies, but may be more easily accessible to other modalities.",Not specified suitable for degradation/inhibition by user.,"The target gene is not """"tissue enriched""""/""""tissue enhanced"""" in any tissue.",N,N,N,Y +389129_442,HDAC11,ENSG00000163517,,14029,1.634292658,0,1.260049115,0.068929196,1,1,4,3,4,2,Tclin,"Small molecule druggable: Protein with a small molecule ligand identified from ChEMBL, meeting TCRD activity criteria.","Clinical data, evidence of tolerable safety profile in desired modality; target has a drug in phase IV in the appropriate modality, with good safety profile.","Medium target qualification, no critical safety issues. Identify and perform steps needed for additional TQ and move to category 1, 2 or 3 if successful, or park in category 5 or 6 if not.","Cell membrane-bound proteins. Highly accessible to antibody-based therapies, but potentially less so than secreted proteins or ECM components.",Not specified suitable for degradation/inhibition by user.,"The target gene is not """"tissue enriched""""/""""tissue enhanced"""" in any tissue.",Y,Y,Y,Y +388638_441,,ENSG00000277020,1.694933378,13537,0.647214232,1.047719146,0,0,,,,,,,,,,,,,,Y,Y,N,N +385369_437,MED29,ENSG00000063322,2.121700201,10256,1.05077206,1.070928141,0,0,7,4,5,6,4,2,Tbio,Potentially targetable by protein family structure: is a member of a gene family which has a protein member with a druggable pocket in the protein structure.,"More than two of: high off target gene expression, cancer driver, essential gene, associated deleterious genetic disorder, HPO phenotype associated gene, or black box warning on clinically used drug.",Medium or high target qualification. Interesting target but has gaps in TQ or tractability/druggability which will make progress difficult or slow. Park and watch for developments e.g. until additional TQ or tools emerge.,Protein located in intracellular compartment.,Not specified suitable for degradation/inhibition by user.,"The target gene is not """"tissue enriched""""/""""tissue enhanced"""" in any tissue.",Y,Y,N,N +390905_444,NRG3-AS1,ENSG00000225738,1.411592739,15807,0.501247906,0.910344833,0,0,,,,,,,,,,,,,,Y,Y,N,N +384723_437,,ENSG00000285582,2.220850217,9610,0.686138318,1.534711899,0,0,,,,,,,,,,,,,,Y,Y,N,N +394347_447,,ENSG00000225017,1.013299554,19263,0.580539456,0.432760098,0,0,,,,,,,,,,,,,,Y,Y,N,N +388040_440,NLGN1-AS1,ENSG00000228213,1.765755814,12936,0.7067041,1.059051714,0,0,,,,,,,,,,,,,,Y,Y,N,N +376866_427,TPM1,ENSG00000140416,3.546033928,1711,1.725863894,1.820170033,0.760263593,0.257244846,12,4,3,3,4,2,Tbio,Potentially low ligandability: Has a structure but there is no evidence of a druggable pocket.,"More than two of: high off target gene expression, cancer driver, essential gene, associated deleterious genetic disorder, HPO phenotype associated gene, or black box warning on clinically used drug.","High target qualification but low feasibility – Tier C for drug development. Attempt to address gaps and re-evaluate if additional feasibility tools found. Either move to category 1 or 2 if successful, or park in category 5 if not.","Cell membrane-bound proteins. Highly accessible to antibody-based therapies, but potentially less so than secreted proteins or ECM components.",Not specified suitable for degradation/inhibition by user.,"The target gene is not """"tissue enriched""""/""""tissue enhanced"""" in any tissue.",Y,Y,Y,Y +383071_435,SAMD1,ENSG00000141858,2.49570642,7952,0.984364579,1.511341841,0,0.1100007,7,3,6,4,4,2,Tdark,Potentially targetable by protein family structure: is a member of a gene family which has a protein member with a druggable pocket in the protein structure.,"Two or fewer of: high off-target gene expression, cancer driver, essential gene, associated deleterious genetic disorder, HPO phenotype associated gene, or black box warning on clinically used drug.",Low target qualification. Does not adequately support target to progress. No clear path forward. Park and watch literature for developments.,"Limited evidence that target is a secreted protein, ECM component or cell membrane-bound protein. ",Not specified suitable for degradation/inhibition by user.,"The target gene is not """"tissue enriched""""/""""tissue enhanced"""" in any tissue.",Y,Y,N,Y +376224_426,NSUN2,ENSG00000037474,3.719834306,1063,2.727519399,0.992314906,1.260049115,0,7,4,4,6,4,2,Tbio,Potentially targetable by protein family structure: is a member of a gene family which has a protein member with a druggable pocket in the protein structure.,"More than two of: high off target gene expression, cancer driver, essential gene, associated deleterious genetic disorder, HPO phenotype associated gene, or black box warning on clinically used drug.","Medium target qualification, no critical safety issues. Identify and perform steps needed for additional TQ and move to category 1, 2 or 3 if successful, or park in category 5 or 6 if not.",Protein located in intracellular compartment.,Not specified suitable for degradation/inhibition by user.,"The target gene is not """"tissue enriched""""/""""tissue enhanced"""" in any tissue.",Y,Y,Y,Y +394277_447,MAB21L1,ENSG00000180660,1.026548266,19193,1.026548266,0,0,0,3,4,3,6,4,4,Tbio,"Targetable by structure: Structurally druggable protein, based on the presence of a druggable pocket in the protein (DrugEBIlity/CanSAR).","More than two of: high off target gene expression, cancer driver, essential gene, associated deleterious genetic disorder, HPO phenotype associated gene, or black box warning on clinically used drug.","High target qualification but low feasibility – Tier C for drug development. Attempt to address gaps and re-evaluate if additional feasibility tools found. Either move to category 1 or 2 if successful, or park in category 5 if not.",Protein located in intracellular compartment.,Not specified suitable for degradation/inhibition by user.,The target gene is enriched/enhanced in tissues marked by the user as being associated with a low risk of off-target engagement issues.,Y,Y,Y,Y +398381_451,COQ10A,ENSG00000135469,0.610112255,23299,0.610112255,0,0,0,13,2,5,4,4,2,Tdark,Unknown: There is no information on ligands or structure in any of the categories above.,"No major issues found from gene expression, genetic or pharmacological profiling, but has not been extensively tested in humans.",Medium or high target qualification. Interesting target but has gaps in TQ or tractability/druggability which will make progress difficult or slow. Park and watch for developments e.g. until additional TQ or tools emerge.,"Limited evidence that target is a secreted protein, ECM component or cell membrane-bound protein. ",Not specified suitable for degradation/inhibition by user.,"The target gene is not """"tissue enriched""""/""""tissue enhanced"""" in any tissue.",Y,Y,Y,Y +387383_440,,ENSG00000287306,1.847974229,12278,0.760978821,1.086995408,0,0,,,,,,,,,,,,,,Y,Y,N,N +381435_433,POGZ,ENSG00000143442,2.791210607,6303,2.019822476,0.771388131,0,0,7,4,4,3,4,2,Tbio,Potentially targetable by protein family structure: is a member of a gene family which has a protein member with a druggable pocket in the protein structure.,"More than two of: high off target gene expression, cancer driver, essential gene, associated deleterious genetic disorder, HPO phenotype associated gene, or black box warning on clinically used drug.","Medium target qualification, no critical safety issues. Identify and perform steps needed for additional TQ and move to category 1, 2 or 3 if successful, or park in category 5 or 6 if not.","Cell membrane-bound proteins. Highly accessible to antibody-based therapies, but potentially less so than secreted proteins or ECM components.",Not specified suitable for degradation/inhibition by user.,"The target gene is not """"tissue enriched""""/""""tissue enhanced"""" in any tissue.",Y,Y,N,N +378852_429,XPO1,ENSG00000082898,3.203269035,3707,1.810349845,1.39291919,0.760263593,0,3,2,3,6,4,2,Tbio,"Targetable by structure: Structurally druggable protein, based on the presence of a druggable pocket in the protein (DrugEBIlity/CanSAR).","No major issues found from gene expression, genetic or pharmacological profiling, but has not been extensively tested in humans.","High target qualification but low feasibility – Tier C for drug development. Attempt to address gaps and re-evaluate if additional feasibility tools found. Either move to category 1 or 2 if successful, or park in category 5 if not.",Protein located in intracellular compartment.,Not specified suitable for degradation/inhibition by user.,"The target gene is not """"tissue enriched""""/""""tissue enhanced"""" in any tissue.",Y,Y,Y,Y +377326_427,DLST,ENSG00000119689,3.455978293,2170,1.69390856,1.762069732,0.944644391,0,11,4,5,3,4,2,Tbio,"Druggable protein class, no other information: Is a member of a PHAROS druggable class of protein (enzyme, receptor, ion channel, nuclear hormone receptor, kinase) but does not meet any of the criteria above.","More than two of: high off target gene expression, cancer driver, essential gene, associated deleterious genetic disorder, HPO phenotype associated gene, or black box warning on clinically used drug.",Medium or high target qualification. Interesting target but has gaps in TQ or tractability/druggability which will make progress difficult or slow. Park and watch for developments e.g. until additional TQ or tools emerge.,"Cell membrane-bound proteins. Highly accessible to antibody-based therapies, but potentially less so than secreted proteins or ECM components.",Not specified suitable for degradation/inhibition by user.,"The target gene is not """"tissue enriched""""/""""tissue enhanced"""" in any tissue.",Y,Y,Y,Y +394150_447,CPB1,ENSG00000153002,1.044579816,19066,1.044579816,0,0,0,1,4,3,1,4,4,Tchem,"Small molecule druggable: Protein with a small molecule ligand identified from ChEMBL, meeting TCRD activity criteria.","More than two of: high off target gene expression, cancer driver, essential gene, associated deleterious genetic disorder, HPO phenotype associated gene, or black box warning on clinically used drug.","High target qualification but low feasibility – Tier C for drug development. Attempt to address gaps and re-evaluate if additional feasibility tools found. Either move to category 1 or 2 if successful, or park in category 5 if not.",Secreted protein. Highly accessible to antibody-based therapies.,Not specified suitable for degradation/inhibition by user.,The target gene is enriched/enhanced in tissues marked by the user as being associated with a low risk of off-target engagement issues.,Y,N,N,N +395167_448,RALGAPA1P1,ENSG00000229419,0.904383679,20084,0.670119353,0.234264326,0,0,,,,,,,,,,,,,,Y,Y,N,N +375407_425,IGF1R,ENSG00000140443,4.0940858,241,2.117549261,1.97653654,1.529966061,0,1,2,2,3,4,2,Tclin,"Small molecule druggable: Protein with a small molecule ligand identified from ChEMBL, meeting TCRD activity criteria.","No major issues found from gene expression, genetic or pharmacological profiling, but has not been extensively tested in humans.",High target qualification but medium feasibility (some gaps) – Tier B for drug development. Proceed while working to address gaps.,"Cell membrane-bound proteins. Highly accessible to antibody-based therapies, but potentially less so than secreted proteins or ECM components.",Not specified suitable for degradation/inhibition by user.,"The target gene is not """"tissue enriched""""/""""tissue enhanced"""" in any tissue.",Y,Y,Y,Y +376472_426,IFITM2,ENSG00000185201,3.641792765,1314,2.129965595,1.51182717,1.391682716,0.357323438,13,4,4,3,4,2,Tbio,Unknown: There is no information on ligands or structure in any of the categories above.,"More than two of: high off target gene expression, cancer driver, essential gene, associated deleterious genetic disorder, HPO phenotype associated gene, or black box warning on clinically used drug.","Medium target qualification, no critical safety issues. Identify and perform steps needed for additional TQ and move to category 1, 2 or 3 if successful, or park in category 5 or 6 if not.","Cell membrane-bound proteins. Highly accessible to antibody-based therapies, but potentially less so than secreted proteins or ECM components.",Not specified suitable for degradation/inhibition by user.,"The target gene is not """"tissue enriched""""/""""tissue enhanced"""" in any tissue.",Y,Y,Y,Y \ No newline at end of file diff --git a/tests/test_assets/distribution_data/input/test_distribution_data_wrong_data_type_genetics.csv b/tests/test_assets/distribution_data/input/test_distribution_data_wrong_data_type_genetics.csv new file mode 100644 index 00000000..dbe5a717 --- /dev/null +++ b/tests/test_assets/distribution_data/input/test_distribution_data_wrong_data_type_genetics.csv @@ -0,0 +1,5 @@ +,hgnc_gene_id,ensg,overall,overall_rank,geneticsscore,omicsscore,literaturescore,neuropathscore,sm_druggability_bucket,safety_bucket,feasibility_bucket,abability_bucket,new_modality_bucket,tissue_engagement_bucket,pharos_class,classification,safety_bucket_definition,feasibility_bucket_definition,abability_bucket_definition,new_modality_bucket_definition,tissue_engagement_bucket_definition,isscored_genetics,isscored_omics,isscored_lit,isscored_neuropath +378977_429,HIVEP2,ENSG00000010818,3.18514153,3832,1.74406224,1.44107929,1.26004912,0.20055564,7,3,4,6,4,2,Tdark,Potentially targetable by protein family structure: is a member of a gene family which has a protein member with a druggable pocket in the protein structure.,"Two or fewer of: high off-target gene expression, cancer driver, essential gene, associated deleterious genetic disorder, HPO phenotype associated gene, or black box warning on clinically used drug.","Medium target qualification, no critical safety issues. Identify and perform steps needed for additional TQ and move to category 1, 2 or 3 if successful, or park in category 5 or 6 if not.",Protein located in intracellular compartment.,Not specified suitable for degradation/inhibition by user.,"The target gene is not """"tissue enriched""""/""""tissue enhanced"""" in any tissue.",Y,Y,Y,Y +378978_429,HIVEP2,ENSG00000010818,3.18514153,3832,1.74406224,1.44107929,1.26004912,0.1,7,3,4,6,4,2,Tdark,Potentially targetable by protein family structure: is a member of a gene family which has a protein member with a druggable pocket in the protein structure.,"Two or fewer of: high off-target gene expression, cancer driver, essential gene, associated deleterious genetic disorder, HPO phenotype associated gene, or black box warning on clinically used drug.","Medium target qualification, no critical safety issues. Identify and perform steps needed for additional TQ and move to category 1, 2 or 3 if successful, or park in category 5 or 6 if not.",Protein located in intracellular compartment.,Not specified suitable for degradation/inhibition by user.,"The target gene is not """"tissue enriched""""/""""tissue enhanced"""" in any tissue.",Y,Y,Y,Y +389156_442,ATP1B3,ENSG00000069849,1.63040118,14056,blank,0,0,0,1,5,4,3,4,2,Tclin,"Small molecule druggable: Protein with a small molecule ligand identified from ChEMBL, meeting TCRD activity criteria.",Clinical data with evidence of intolerable safety profile/adverse drug reactions in the desired modality and with target engagement. Drug for target withdrawn on those grounds.,"Medium target qualification, no critical safety issues. Identify and perform steps needed for additional TQ and move to category 1, 2 or 3 if successful, or park in category 5 or 6 if not.","Cell membrane-bound proteins. Highly accessible to antibody-based therapies, but potentially less so than secreted proteins or ECM components.",Not specified suitable for degradation/inhibition by user.,"The target gene is not """"tissue enriched""""/""""tissue enhanced"""" in any tissue.",Y,Y,N,N +376410_426,SPR,ENSG00000116096,3.66345432,1251,1.69516494,1.96828938,1.55615776,1.12884311,3,4,3,5,4,2,Tbio,"Targetable by structure: Structurally druggable protein, based on the presence of a druggable pocket in the protein (DrugEBIlity/CanSAR).","More than two of: high off target gene expression, cancer driver, essential gene, associated deleterious genetic disorder, HPO phenotype associated gene, or black box warning on clinically used drug.","High target qualification but low feasibility – Tier C for drug development. Attempt to address gaps and re-evaluate if additional feasibility tools found. Either move to category 1 or 2 if successful, or park in category 5 if not.","Protein located in the cytosol. Not practically accessible to antibody-based therapies, but may be more easily accessible to other modalities.",Not specified suitable for degradation/inhibition by user.,"The target gene is not """"tissue enriched""""/""""tissue enhanced"""" in any tissue.",Y,Y,N,N \ No newline at end of file diff --git a/tests/test_assets/distribution_data/input/test_distribution_data_wrong_data_type_omics.csv b/tests/test_assets/distribution_data/input/test_distribution_data_wrong_data_type_omics.csv new file mode 100644 index 00000000..02eb24e7 --- /dev/null +++ b/tests/test_assets/distribution_data/input/test_distribution_data_wrong_data_type_omics.csv @@ -0,0 +1,5 @@ +,hgnc_gene_id,ensg,overall,overall_rank,geneticsscore,omicsscore,literaturescore,neuropathscore,sm_druggability_bucket,safety_bucket,feasibility_bucket,abability_bucket,new_modality_bucket,tissue_engagement_bucket,pharos_class,classification,safety_bucket_definition,feasibility_bucket_definition,abability_bucket_definition,new_modality_bucket_definition,tissue_engagement_bucket_definition,isscored_genetics,isscored_omics,isscored_lit,isscored_neuropath +378977_429,HIVEP2,ENSG00000010818,3.18514153,3832,1.74406224,1.44107929,1.26004912,0.20055564,7,3,4,6,4,2,Tdark,Potentially targetable by protein family structure: is a member of a gene family which has a protein member with a druggable pocket in the protein structure.,"Two or fewer of: high off-target gene expression, cancer driver, essential gene, associated deleterious genetic disorder, HPO phenotype associated gene, or black box warning on clinically used drug.","Medium target qualification, no critical safety issues. Identify and perform steps needed for additional TQ and move to category 1, 2 or 3 if successful, or park in category 5 or 6 if not.",Protein located in intracellular compartment.,Not specified suitable for degradation/inhibition by user.,"The target gene is not """"tissue enriched""""/""""tissue enhanced"""" in any tissue.",Y,Y,Y,Y +378978_429,HIVEP2,ENSG00000010818,3.18514153,3832,1.74406224,1.44107929,1.26004912,0.1,7,3,4,6,4,2,Tdark,Potentially targetable by protein family structure: is a member of a gene family which has a protein member with a druggable pocket in the protein structure.,"Two or fewer of: high off-target gene expression, cancer driver, essential gene, associated deleterious genetic disorder, HPO phenotype associated gene, or black box warning on clinically used drug.","Medium target qualification, no critical safety issues. Identify and perform steps needed for additional TQ and move to category 1, 2 or 3 if successful, or park in category 5 or 6 if not.",Protein located in intracellular compartment.,Not specified suitable for degradation/inhibition by user.,"The target gene is not """"tissue enriched""""/""""tissue enhanced"""" in any tissue.",Y,Y,Y,Y +389156_442,ATP1B3,ENSG00000069849,1.63040118,14056,1.63040118,0,0,0,1,5,4,3,4,2,Tclin,"Small molecule druggable: Protein with a small molecule ligand identified from ChEMBL, meeting TCRD activity criteria.",Clinical data with evidence of intolerable safety profile/adverse drug reactions in the desired modality and with target engagement. Drug for target withdrawn on those grounds.,"Medium target qualification, no critical safety issues. Identify and perform steps needed for additional TQ and move to category 1, 2 or 3 if successful, or park in category 5 or 6 if not.","Cell membrane-bound proteins. Highly accessible to antibody-based therapies, but potentially less so than secreted proteins or ECM components.",Not specified suitable for degradation/inhibition by user.,"The target gene is not """"tissue enriched""""/""""tissue enhanced"""" in any tissue.",Y,Y,N,N +376410_426,SPR,ENSG00000116096,3.66345432,1251,1.69516494,blank,1.55615776,1.12884311,3,4,3,5,4,2,Tbio,"Targetable by structure: Structurally druggable protein, based on the presence of a druggable pocket in the protein (DrugEBIlity/CanSAR).","More than two of: high off target gene expression, cancer driver, essential gene, associated deleterious genetic disorder, HPO phenotype associated gene, or black box warning on clinically used drug.","High target qualification but low feasibility – Tier C for drug development. Attempt to address gaps and re-evaluate if additional feasibility tools found. Either move to category 1 or 2 if successful, or park in category 5 if not.","Protein located in the cytosol. Not practically accessible to antibody-based therapies, but may be more easily accessible to other modalities.",Not specified suitable for degradation/inhibition by user.,"The target gene is not """"tissue enriched""""/""""tissue enhanced"""" in any tissue.",Y,Y,N,N \ No newline at end of file diff --git a/tests/test_assets/distribution_data/input/test_distribution_data_wrong_data_type_overall.csv b/tests/test_assets/distribution_data/input/test_distribution_data_wrong_data_type_overall.csv new file mode 100644 index 00000000..350be113 --- /dev/null +++ b/tests/test_assets/distribution_data/input/test_distribution_data_wrong_data_type_overall.csv @@ -0,0 +1,5 @@ +,hgnc_gene_id,ensg,overall,overall_rank,geneticsscore,omicsscore,literaturescore,neuropathscore,sm_druggability_bucket,safety_bucket,feasibility_bucket,abability_bucket,new_modality_bucket,tissue_engagement_bucket,pharos_class,classification,safety_bucket_definition,feasibility_bucket_definition,abability_bucket_definition,new_modality_bucket_definition,tissue_engagement_bucket_definition,isscored_genetics,isscored_omics,isscored_lit,isscored_neuropath +378977_429,HIVEP2,ENSG00000010818,3.185141534,3832,1.744062244,1.441079289,1.260049115,0.200555635,7,3,4,6,4,2,Tdark,Potentially targetable by protein family structure: is a member of a gene family which has a protein member with a druggable pocket in the protein structure.,"Two or fewer of: high off-target gene expression, cancer driver, essential gene, associated deleterious genetic disorder, HPO phenotype associated gene, or black box warning on clinically used drug.","Medium target qualification, no critical safety issues. Identify and perform steps needed for additional TQ and move to category 1, 2 or 3 if successful, or park in category 5 or 6 if not.",Protein located in intracellular compartment.,Not specified suitable for degradation/inhibition by user.,"The target gene is not """"tissue enriched""""/""""tissue enhanced"""" in any tissue.",Y,Y,Y,Y +378978_429,HIVEP2,ENSG00000010818,3.185141534,3832,1.744062244,1.441079289,1.260049115,0.1,7,3,4,6,4,2,Tdark,Potentially targetable by protein family structure: is a member of a gene family which has a protein member with a druggable pocket in the protein structure.,"Two or fewer of: high off-target gene expression, cancer driver, essential gene, associated deleterious genetic disorder, HPO phenotype associated gene, or black box warning on clinically used drug.","Medium target qualification, no critical safety issues. Identify and perform steps needed for additional TQ and move to category 1, 2 or 3 if successful, or park in category 5 or 6 if not.",Protein located in intracellular compartment.,Not specified suitable for degradation/inhibition by user.,"The target gene is not """"tissue enriched""""/""""tissue enhanced"""" in any tissue.",Y,Y,Y,Y +389156_442,ATP1B3,ENSG00000069849,blank,14056,1.63040118,0,0,0,1,5,4,3,4,2,Tclin,"Small molecule druggable: Protein with a small molecule ligand identified from ChEMBL, meeting TCRD activity criteria.",Clinical data with evidence of intolerable safety profile/adverse drug reactions in the desired modality and with target engagement. Drug for target withdrawn on those grounds.,"Medium target qualification, no critical safety issues. Identify and perform steps needed for additional TQ and move to category 1, 2 or 3 if successful, or park in category 5 or 6 if not.","Cell membrane-bound proteins. Highly accessible to antibody-based therapies, but potentially less so than secreted proteins or ECM components.",Not specified suitable for degradation/inhibition by user.,"The target gene is not """"tissue enriched""""/""""tissue enhanced"""" in any tissue.",Y,Y,N,N +376410_426,SPR,ENSG00000116096,3.663454319,1251,1.695164935,1.968289384,1.55615776,1.128843106,3,4,3,5,4,2,Tbio,"Targetable by structure: Structurally druggable protein, based on the presence of a druggable pocket in the protein (DrugEBIlity/CanSAR).","More than two of: high off target gene expression, cancer driver, essential gene, associated deleterious genetic disorder, HPO phenotype associated gene, or black box warning on clinically used drug.","High target qualification but low feasibility – Tier C for drug development. Attempt to address gaps and re-evaluate if additional feasibility tools found. Either move to category 1 or 2 if successful, or park in category 5 if not.","Protein located in the cytosol. Not practically accessible to antibody-based therapies, but may be more easily accessible to other modalities.",Not specified suitable for degradation/inhibition by user.,"The target gene is not """"tissue enriched""""/""""tissue enhanced"""" in any tissue.",Y,Y,N,N \ No newline at end of file diff --git a/tests/test_assets/distribution_data/output/distribution_data_good_output_1.json b/tests/test_assets/distribution_data/output/distribution_data_good_output_1.json new file mode 100644 index 00000000..9006c0a6 --- /dev/null +++ b/tests/test_assets/distribution_data/output/distribution_data_good_output_1.json @@ -0,0 +1,260 @@ +[ + { + "target_risk_score": { + "distribution": [ + 0, + 2, + 4, + 6, + 3, + 1, + 4, + 3, + 1, + 0 + ], + "bins": [ + [ + 0, + 0.5 + ], + [ + 0.5, + 1.0 + ], + [ + 1.0, + 1.5 + ], + [ + 1.5, + 2.0 + ], + [ + 2.0, + 2.5 + ], + [ + 2.5, + 3.0 + ], + [ + 3.0, + 3.5 + ], + [ + 3.5, + 4.0 + ], + [ + 4.0, + 4.5 + ], + [ + 4.5, + 5.0 + ] + ], + "min": 0.6101, + "max": 4.0941, + "mean": 2.2578, + "first_quartile": 2.0, + "third_quartile": 3.0, + "name": "Target Risk Score", + "syn_id": "syn25913473", + "wiki_id": "621071" + }, + "genetics_score": { + "distribution": [ + 0, + 2, + 6, + 4, + 0, + 6, + 2, + 2, + 0, + 1 + ], + "bins": [ + [ + 0, + 0.3 + ], + [ + 0.3, + 0.6 + ], + [ + 0.6, + 0.9 + ], + [ + 0.9, + 1.2 + ], + [ + 1.2, + 1.5 + ], + [ + 1.5, + 1.8 + ], + [ + 1.8, + 2.1 + ], + [ + 2.1, + 2.4 + ], + [ + 2.4, + 2.7 + ], + [ + 2.7, + 3.0 + ] + ], + "min": 0.5012, + "max": 2.7275, + "mean": 1.3151, + "first_quartile": 1.0, + "third_quartile": 2.0, + "name": "Genetic Risk Score", + "syn_id": "syn25913473", + "wiki_id": "621069" + }, + "multi_omics_score": { + "distribution": [ + 4, + 1, + 1, + 1, + 2, + 4, + 1, + 5, + 1, + 2 + ], + "bins": [ + [ + 0, + 0.2 + ], + [ + 0.2, + 0.4 + ], + [ + 0.4, + 0.6 + ], + [ + 0.6, + 0.8 + ], + [ + 0.8, + 1.0 + ], + [ + 1.0, + 1.2 + ], + [ + 1.2, + 1.4 + ], + [ + 1.4, + 1.6 + ], + [ + 1.6, + 1.8 + ], + [ + 1.8, + 2.0 + ] + ], + "min": 0.0, + "max": 1.9765, + "mean": 0.9999, + "first_quartile": 1.0, + "third_quartile": 1.0, + "name": "Multi-omic Risk Score", + "syn_id": "syn25913473", + "wiki_id": "621070" + }, + "literature_score": { + "distribution": [ + 3, + 0, + 0, + 2, + 1, + 0, + 5, + 1, + 0, + 0 + ], + "bins": [ + [ + 0, + 0.2 + ], + [ + 0.2, + 0.4 + ], + [ + 0.4, + 0.6 + ], + [ + 0.6, + 0.8 + ], + [ + 0.8, + 1.0 + ], + [ + 1.0, + 1.2 + ], + [ + 1.2, + 1.4 + ], + [ + 1.4, + 1.6 + ], + [ + 1.6, + 1.8 + ], + [ + 1.8, + 2.0 + ] + ], + "min": 0.0, + "max": 1.53, + "mean": 0.8707, + "first_quartile": 0.0, + "third_quartile": 1.0, + "name": "Literature Score", + "syn_id": "syn25913473", + "wiki_id": "613105" + } + } +] \ No newline at end of file diff --git a/tests/test_assets/distribution_data/output/distribution_data_good_output_2.json b/tests/test_assets/distribution_data/output/distribution_data_good_output_2.json new file mode 100644 index 00000000..807f5583 --- /dev/null +++ b/tests/test_assets/distribution_data/output/distribution_data_good_output_2.json @@ -0,0 +1,260 @@ +[ + { + "target_risk_score": { + "distribution": [ + 1, + 5, + 8, + 4, + 5, + 1, + 0, + 0, + 0, + 0 + ], + "bins": [ + [ + 0, + 0.8 + ], + [ + 0.8, + 1.6 + ], + [ + 1.6, + 2.4 + ], + [ + 2.4, + 3.2 + ], + [ + 3.2, + 4.0 + ], + [ + 4.0, + 4.8 + ], + [ + 4.8, + 5.6 + ], + [ + 5.6, + 6.4 + ], + [ + 6.4, + 7.2 + ], + [ + 7.2, + 8.0 + ] + ], + "min": 0.6101, + "max": 4.0941, + "mean": 2.2578, + "first_quartile": 2.0, + "third_quartile": 3.0, + "name": "Target Risk Score", + "syn_id": "syn25913473", + "wiki_id": "621071" + }, + "genetics_score": { + "distribution": [ + 2, + 10, + 6, + 4, + 1, + 0, + 0, + 0, + 0, + 0 + ], + "bins": [ + [ + 0, + 0.6 + ], + [ + 0.6, + 1.2 + ], + [ + 1.2, + 1.8 + ], + [ + 1.8, + 2.4 + ], + [ + 2.4, + 3.0 + ], + [ + 3.0, + 3.6 + ], + [ + 3.6, + 4.2 + ], + [ + 4.2, + 4.8 + ], + [ + 4.8, + 5.4 + ], + [ + 5.4, + 6.0 + ] + ], + "min": 0.5012, + "max": 2.7275, + "mean": 1.3151, + "first_quartile": 1.0, + "third_quartile": 2.0, + "name": "Genetic Risk Score", + "syn_id": "syn25913473", + "wiki_id": "621069" + }, + "multi_omics_score": { + "distribution": [ + 6, + 3, + 7, + 6, + 0, + 0, + 0, + 0, + 0, + 0 + ], + "bins": [ + [ + 0, + 0.5 + ], + [ + 0.5, + 1.0 + ], + [ + 1.0, + 1.5 + ], + [ + 1.5, + 2.0 + ], + [ + 2.0, + 2.5 + ], + [ + 2.5, + 3.0 + ], + [ + 3.0, + 3.5 + ], + [ + 3.5, + 4.0 + ], + [ + 4.0, + 4.5 + ], + [ + 4.5, + 5.0 + ] + ], + "min": 0.0, + "max": 1.9765, + "mean": 0.9999, + "first_quartile": 1.0, + "third_quartile": 1.0, + "name": "Multi-omic Risk Score", + "syn_id": "syn25913473", + "wiki_id": "621070" + }, + "literature_score": { + "distribution": [ + 3, + 3, + 5, + 1, + 0, + 0, + 0, + 0, + 0, + 0 + ], + "bins": [ + [ + 0, + 0.5 + ], + [ + 0.5, + 1.0 + ], + [ + 1.0, + 1.5 + ], + [ + 1.5, + 2.0 + ], + [ + 2.0, + 2.5 + ], + [ + 2.5, + 3.0 + ], + [ + 3.0, + 3.5 + ], + [ + 3.5, + 4.0 + ], + [ + 4.0, + 4.5 + ], + [ + 4.5, + 5.0 + ] + ], + "min": 0.0, + "max": 1.53, + "mean": 0.8707, + "first_quartile": 0.0, + "third_quartile": 1.0, + "name": "Literature Score", + "syn_id": "syn25913473", + "wiki_id": "613105" + } + } +] \ No newline at end of file diff --git a/tests/test_assets/distribution_data/output/distribution_data_missing_output.json b/tests/test_assets/distribution_data/output/distribution_data_missing_output.json new file mode 100644 index 00000000..f6ba953f --- /dev/null +++ b/tests/test_assets/distribution_data/output/distribution_data_missing_output.json @@ -0,0 +1,260 @@ +[ + { + "target_risk_score": { + "distribution": [ + 0, + 2, + 4, + 7, + 3, + 1, + 5, + 3, + 1, + 0 + ], + "bins": [ + [ + 0, + 0.5 + ], + [ + 0.5, + 1.0 + ], + [ + 1.0, + 1.5 + ], + [ + 1.5, + 2.0 + ], + [ + 2.0, + 2.5 + ], + [ + 2.5, + 3.0 + ], + [ + 3.0, + 3.5 + ], + [ + 3.5, + 4.0 + ], + [ + 4.0, + 4.5 + ], + [ + 4.5, + 5.0 + ] + ], + "min": 0.6101, + "max": 4.0941, + "mean": 2.2932, + "first_quartile": 2.0, + "third_quartile": 3.0, + "name": "Target Risk Score", + "syn_id": "syn25913473", + "wiki_id": "621071" + }, + "genetics_score": { + "distribution": [ + 0, + 2, + 8, + 4, + 0, + 5, + 2, + 2, + 0, + 1 + ], + "bins": [ + [ + 0, + 0.3 + ], + [ + 0.3, + 0.6 + ], + [ + 0.6, + 0.9 + ], + [ + 0.9, + 1.2 + ], + [ + 1.2, + 1.5 + ], + [ + 1.5, + 1.8 + ], + [ + 1.8, + 2.1 + ], + [ + 2.1, + 2.4 + ], + [ + 2.4, + 2.7 + ], + [ + 2.7, + 3.0 + ] + ], + "min": 0.5012, + "max": 2.7275, + "mean": 1.2585, + "first_quartile": 1.0, + "third_quartile": 2.0, + "name": "Genetic Risk Score", + "syn_id": "syn25913473", + "wiki_id": "621069" + }, + "multi_omics_score": { + "distribution": [ + 4, + 1, + 1, + 1, + 2, + 5, + 1, + 5, + 1, + 2 + ], + "bins": [ + [ + 0, + 0.2 + ], + [ + 0.2, + 0.4 + ], + [ + 0.4, + 0.6 + ], + [ + 0.6, + 0.8 + ], + [ + 0.8, + 1.0 + ], + [ + 1.0, + 1.2 + ], + [ + 1.2, + 1.4 + ], + [ + 1.4, + 1.6 + ], + [ + 1.6, + 1.8 + ], + [ + 1.8, + 2.0 + ] + ], + "min": 0.0, + "max": 1.9765, + "mean": 1.0039, + "first_quartile": 1.0, + "third_quartile": 1.0, + "name": "Multi-omic Risk Score", + "syn_id": "syn25913473", + "wiki_id": "621070" + }, + "literature_score": { + "distribution": [ + 4, + 0, + 0, + 2, + 1, + 0, + 6, + 1, + 0, + 0 + ], + "bins": [ + [ + 0, + 0.2 + ], + [ + 0.2, + 0.4 + ], + [ + 0.4, + 0.6 + ], + [ + 0.6, + 0.8 + ], + [ + 0.8, + 1.0 + ], + [ + 1.0, + 1.2 + ], + [ + 1.2, + 1.4 + ], + [ + 1.4, + 1.6 + ], + [ + 1.6, + 1.8 + ], + [ + 1.8, + 2.0 + ] + ], + "min": 0.0, + "max": 1.53, + "mean": 0.8379, + "first_quartile": 0.0, + "third_quartile": 1.0, + "name": "Literature Score", + "syn_id": "syn25913473", + "wiki_id": "613105" + } + } +] \ No newline at end of file diff --git a/tests/transform/test_distribution_data.py b/tests/transform/test_distribution_data.py new file mode 100644 index 00000000..432c835b --- /dev/null +++ b/tests/transform/test_distribution_data.py @@ -0,0 +1,118 @@ +import os +import pandas as pd +import json +import pytest + +from agoradatatools.etl.transform import distribution_data + + +class TestTransformDistributionData: + data_files_path = "tests/test_assets/distribution_data" + param_set_1 = { + "overall_max_score": 5, + "genetics_max_score": 3, + "omics_max_score": 2, + "lit_max_score": 2, + } + param_set_2 = { + "overall_max_score": 8, + "genetics_max_score": 6, + "omics_max_score": 5, + "lit_max_score": 5, + } + + pass_test_data = [ + ( # Pass with good data on param set 1 + "test_distribution_data_good_input.csv", + "distribution_data_good_output_1.json", + param_set_1, + ), + ( # Pass with good data on param set 2 + "test_distribution_data_good_input.csv", + "distribution_data_good_output_2.json", + param_set_2, + ), + ( # Pass with values missing from each column + "test_distribution_data_missing_input.csv", + "distribution_data_missing_output.json", + param_set_1, + ), + ] + pass_test_ids = [ + "Pass with good data on parameter set 1", + "Pass with good data on parameter set 2", + "Pass with missing values in each column", + ] + fail_test_data = [ + ( # Bad data type + "test_distribution_data_wrong_data_type_overall.csv", + param_set_1, + ValueError, + ), + ( # Bad data type + "test_distribution_data_wrong_data_type_genetics.csv", + param_set_1, + ValueError, + ), + ( # Bad data type + "test_distribution_data_wrong_data_type_omics.csv", + param_set_1, + ValueError, + ), + ] + fail_test_ids = [ + "Fail with bad data type in overall score column", + "Fail with bad data type in genetics score column", + "Fail with bad data type in omics score column", + ] + + @pytest.mark.parametrize( + "distribution_data_file, expected_output_file, param_set", + pass_test_data, + ids=pass_test_ids, + ) + def test_transform_distribution_data_should_pass( + self, distribution_data_file, expected_output_file, param_set + ): + distribution_data_df = pd.read_csv( + os.path.join(self.data_files_path, "input", distribution_data_file), + index_col=0, + ) + output_dict = distribution_data.transform_distribution_data( + datasets={"overall_scores": distribution_data_df}, + overall_max_score=param_set["overall_max_score"], + genetics_max_score=param_set["genetics_max_score"], + omics_max_score=param_set["omics_max_score"], + lit_max_score=param_set["lit_max_score"], + ) + + # Writing to JSON changes "bins" from tuples to lists, so output_dict and expected_dict + # would not be equal since expected_dict is read from JSON. We solve this by turning + # output_dict into a JSON string and reading back into a dict. + output_dict = json.loads(json.dumps(output_dict)) + + json_file = os.path.join(self.data_files_path, "output", expected_output_file) + with open(json_file) as file: + expected_dict = json.load(file)[0] + assert output_dict == expected_dict + + @pytest.mark.parametrize( + "distribution_data_file, param_set, error_type", + fail_test_data, + ids=fail_test_ids, + ) + def test_transform_distribution_data_should_fail( + self, distribution_data_file, param_set, error_type + ): + with pytest.raises(error_type): + distribution_data_df = pd.read_csv( + os.path.join(self.data_files_path, "input", distribution_data_file), + index_col=0, + ) + distribution_data.transform_distribution_data( + datasets={"overall_scores": distribution_data_df}, + overall_max_score=param_set["overall_max_score"], + genetics_max_score=param_set["genetics_max_score"], + omics_max_score=param_set["omics_max_score"], + lit_max_score=param_set["lit_max_score"], + ) From 0f3d66d10645f8e24b161bea05cb697bd5491ddd Mon Sep 17 00:00:00 2001 From: Jaclyn Beck Date: Thu, 6 Jul 2023 16:30:02 -0700 Subject: [PATCH 3/6] AG-1116: Removed literature score from distribution_data transform and updated tests to match --- config.yaml | 1 - .../etl/transform/distribution_data.py | 8 +-- src/agoradatatools/process.py | 3 +- test_config.yaml | 1 - .../distribution_data_good_output_1.json | 68 +------------------ .../distribution_data_good_output_2.json | 68 +------------------ .../distribution_data_missing_output.json | 68 +------------------ tests/transform/test_distribution_data.py | 4 -- 8 files changed, 9 insertions(+), 212 deletions(-) diff --git a/config.yaml b/config.yaml index 750cda46..895c637e 100644 --- a/config.yaml +++ b/config.yaml @@ -223,7 +223,6 @@ overall_max_score: 5 genetics_max_score: 3 omics_max_score: 2 - lit_max_score: 2 provenance: - syn25575156.13 destination: *dest diff --git a/src/agoradatatools/etl/transform/distribution_data.py b/src/agoradatatools/etl/transform/distribution_data.py index 7ce7c55a..c5a37b1a 100644 --- a/src/agoradatatools/etl/transform/distribution_data.py +++ b/src/agoradatatools/etl/transform/distribution_data.py @@ -62,7 +62,6 @@ def transform_distribution_data( overall_max_score, genetics_max_score, omics_max_score, - lit_max_score, ): overall_scores = datasets["overall_scores"] interesting_columns = [ @@ -70,11 +69,10 @@ def transform_distribution_data( "overall", "geneticsscore", "omicsscore", - "literaturescore", ] # create mapping to deal with missing values as they take different shape across the fields - scored = ["isscored_genetics", "isscored_omics", "isscored_lit"] + scored = ["isscored_genetics", "isscored_omics"] mapping = dict(zip(interesting_columns[2:], scored)) mapping["overall"] = None @@ -82,7 +80,7 @@ def transform_distribution_data( max_score = dict( zip( interesting_columns[1:], - [overall_max_score, genetics_max_score, omics_max_score, lit_max_score], + [overall_max_score, genetics_max_score, omics_max_score], ) ) @@ -97,13 +95,11 @@ def transform_distribution_data( neo_matrix["target_risk_score"] = neo_matrix.pop("overall") neo_matrix["genetics_score"] = neo_matrix.pop("geneticsscore") neo_matrix["multi_omics_score"] = neo_matrix.pop("omicsscore") - neo_matrix["literature_score"] = neo_matrix.pop("literaturescore") additional_data = [ {"name": "Target Risk Score", "syn_id": "syn25913473", "wiki_id": "621071"}, {"name": "Genetic Risk Score", "syn_id": "syn25913473", "wiki_id": "621069"}, {"name": "Multi-omic Risk Score", "syn_id": "syn25913473", "wiki_id": "621070"}, - {"name": "Literature Score", "syn_id": "syn25913473", "wiki_id": "613105"}, ] for col, additional in zip(neo_matrix.keys(), additional_data): neo_matrix[col]["name"] = additional["name"] diff --git a/src/agoradatatools/process.py b/src/agoradatatools/process.py index f948e9f1..932304ed 100644 --- a/src/agoradatatools/process.py +++ b/src/agoradatatools/process.py @@ -30,8 +30,7 @@ def apply_custom_transformations(datasets: dict, dataset_name: str, dataset_obj: genetics_max_score=dataset_obj["custom_transformations"][ "genetics_max_score" ], - omics_max_score=dataset_obj["custom_transformations"]["omics_max_score"], - lit_max_score=dataset_obj["custom_transformations"]["lit_max_score"], + omics_max_score=dataset_obj["custom_transformations"]["omics_max_score"] ) if dataset_name == "team_info": return transform.transform_team_info(datasets=datasets) diff --git a/test_config.yaml b/test_config.yaml index f4b2dc99..b7394b69 100644 --- a/test_config.yaml +++ b/test_config.yaml @@ -223,7 +223,6 @@ overall_max_score: 5 genetics_max_score: 3 omics_max_score: 2 - lit_max_score: 2 provenance: - syn25575156.13 destination: *dest diff --git a/tests/test_assets/distribution_data/output/distribution_data_good_output_1.json b/tests/test_assets/distribution_data/output/distribution_data_good_output_1.json index 9006c0a6..2d9a71b7 100644 --- a/tests/test_assets/distribution_data/output/distribution_data_good_output_1.json +++ b/tests/test_assets/distribution_data/output/distribution_data_good_output_1.json @@ -5,7 +5,7 @@ 0, 2, 4, - 6, + 5, 3, 1, 4, @@ -57,7 +57,7 @@ ], "min": 0.6101, "max": 4.0941, - "mean": 2.2578, + "mean": 2.2715, "first_quartile": 2.0, "third_quartile": 3.0, "name": "Target Risk Score", @@ -191,70 +191,6 @@ "name": "Multi-omic Risk Score", "syn_id": "syn25913473", "wiki_id": "621070" - }, - "literature_score": { - "distribution": [ - 3, - 0, - 0, - 2, - 1, - 0, - 5, - 1, - 0, - 0 - ], - "bins": [ - [ - 0, - 0.2 - ], - [ - 0.2, - 0.4 - ], - [ - 0.4, - 0.6 - ], - [ - 0.6, - 0.8 - ], - [ - 0.8, - 1.0 - ], - [ - 1.0, - 1.2 - ], - [ - 1.2, - 1.4 - ], - [ - 1.4, - 1.6 - ], - [ - 1.6, - 1.8 - ], - [ - 1.8, - 2.0 - ] - ], - "min": 0.0, - "max": 1.53, - "mean": 0.8707, - "first_quartile": 0.0, - "third_quartile": 1.0, - "name": "Literature Score", - "syn_id": "syn25913473", - "wiki_id": "613105" } } ] \ No newline at end of file diff --git a/tests/test_assets/distribution_data/output/distribution_data_good_output_2.json b/tests/test_assets/distribution_data/output/distribution_data_good_output_2.json index 807f5583..1f304e3a 100644 --- a/tests/test_assets/distribution_data/output/distribution_data_good_output_2.json +++ b/tests/test_assets/distribution_data/output/distribution_data_good_output_2.json @@ -4,7 +4,7 @@ "distribution": [ 1, 5, - 8, + 7, 4, 5, 1, @@ -57,7 +57,7 @@ ], "min": 0.6101, "max": 4.0941, - "mean": 2.2578, + "mean": 2.2715, "first_quartile": 2.0, "third_quartile": 3.0, "name": "Target Risk Score", @@ -191,70 +191,6 @@ "name": "Multi-omic Risk Score", "syn_id": "syn25913473", "wiki_id": "621070" - }, - "literature_score": { - "distribution": [ - 3, - 3, - 5, - 1, - 0, - 0, - 0, - 0, - 0, - 0 - ], - "bins": [ - [ - 0, - 0.5 - ], - [ - 0.5, - 1.0 - ], - [ - 1.0, - 1.5 - ], - [ - 1.5, - 2.0 - ], - [ - 2.0, - 2.5 - ], - [ - 2.5, - 3.0 - ], - [ - 3.0, - 3.5 - ], - [ - 3.5, - 4.0 - ], - [ - 4.0, - 4.5 - ], - [ - 4.5, - 5.0 - ] - ], - "min": 0.0, - "max": 1.53, - "mean": 0.8707, - "first_quartile": 0.0, - "third_quartile": 1.0, - "name": "Literature Score", - "syn_id": "syn25913473", - "wiki_id": "613105" } } ] \ No newline at end of file diff --git a/tests/test_assets/distribution_data/output/distribution_data_missing_output.json b/tests/test_assets/distribution_data/output/distribution_data_missing_output.json index f6ba953f..164b8c22 100644 --- a/tests/test_assets/distribution_data/output/distribution_data_missing_output.json +++ b/tests/test_assets/distribution_data/output/distribution_data_missing_output.json @@ -5,7 +5,7 @@ 0, 2, 4, - 7, + 6, 3, 1, 5, @@ -57,7 +57,7 @@ ], "min": 0.6101, "max": 4.0941, - "mean": 2.2932, + "mean": 2.3072, "first_quartile": 2.0, "third_quartile": 3.0, "name": "Target Risk Score", @@ -191,70 +191,6 @@ "name": "Multi-omic Risk Score", "syn_id": "syn25913473", "wiki_id": "621070" - }, - "literature_score": { - "distribution": [ - 4, - 0, - 0, - 2, - 1, - 0, - 6, - 1, - 0, - 0 - ], - "bins": [ - [ - 0, - 0.2 - ], - [ - 0.2, - 0.4 - ], - [ - 0.4, - 0.6 - ], - [ - 0.6, - 0.8 - ], - [ - 0.8, - 1.0 - ], - [ - 1.0, - 1.2 - ], - [ - 1.2, - 1.4 - ], - [ - 1.4, - 1.6 - ], - [ - 1.6, - 1.8 - ], - [ - 1.8, - 2.0 - ] - ], - "min": 0.0, - "max": 1.53, - "mean": 0.8379, - "first_quartile": 0.0, - "third_quartile": 1.0, - "name": "Literature Score", - "syn_id": "syn25913473", - "wiki_id": "613105" } } ] \ No newline at end of file diff --git a/tests/transform/test_distribution_data.py b/tests/transform/test_distribution_data.py index 432c835b..9987cfea 100644 --- a/tests/transform/test_distribution_data.py +++ b/tests/transform/test_distribution_data.py @@ -12,13 +12,11 @@ class TestTransformDistributionData: "overall_max_score": 5, "genetics_max_score": 3, "omics_max_score": 2, - "lit_max_score": 2, } param_set_2 = { "overall_max_score": 8, "genetics_max_score": 6, "omics_max_score": 5, - "lit_max_score": 5, } pass_test_data = [ @@ -83,7 +81,6 @@ def test_transform_distribution_data_should_pass( overall_max_score=param_set["overall_max_score"], genetics_max_score=param_set["genetics_max_score"], omics_max_score=param_set["omics_max_score"], - lit_max_score=param_set["lit_max_score"], ) # Writing to JSON changes "bins" from tuples to lists, so output_dict and expected_dict @@ -114,5 +111,4 @@ def test_transform_distribution_data_should_fail( overall_max_score=param_set["overall_max_score"], genetics_max_score=param_set["genetics_max_score"], omics_max_score=param_set["omics_max_score"], - lit_max_score=param_set["lit_max_score"], ) From 66b3012e069f8c4d41679777ac789079898bf464 Mon Sep 17 00:00:00 2001 From: Jaclyn Beck Date: Thu, 6 Jul 2023 17:02:54 -0700 Subject: [PATCH 4/6] Made a comment in the distribution_data test more clear --- tests/transform/test_distribution_data.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/transform/test_distribution_data.py b/tests/transform/test_distribution_data.py index 9987cfea..de7fade5 100644 --- a/tests/transform/test_distribution_data.py +++ b/tests/transform/test_distribution_data.py @@ -83,9 +83,9 @@ def test_transform_distribution_data_should_pass( omics_max_score=param_set["omics_max_score"], ) - # Writing to JSON changes "bins" from tuples to lists, so output_dict and expected_dict - # would not be equal since expected_dict is read from JSON. We solve this by turning - # output_dict into a JSON string and reading back into a dict. + # Writing to JSON changes the "bins" entry in this dict from tuples to lists, so + # output_dict and expected_dict would not be equal since expected_dict is read from JSON. + # We solve this by turning output_dict into a JSON string and reading back into a dict. output_dict = json.loads(json.dumps(output_dict)) json_file = os.path.join(self.data_files_path, "output", expected_output_file) From cc069a6026923e55159d3e697c4b4590ab6d1730 Mon Sep 17 00:00:00 2001 From: Jaclyn Beck Date: Mon, 10 Jul 2023 14:07:27 -0700 Subject: [PATCH 5/6] Bug fixes: distribution data transform now drops duplicate rows and rounds the quartiles to 4 decimal places --- .../etl/transform/distribution_data.py | 6 ++--- .../distribution_data_good_output_1.json | 24 +++++++++---------- .../distribution_data_good_output_2.json | 24 +++++++++---------- .../distribution_data_missing_output.json | 12 +++++----- 4 files changed, 33 insertions(+), 33 deletions(-) diff --git a/src/agoradatatools/etl/transform/distribution_data.py b/src/agoradatatools/etl/transform/distribution_data.py index c5a37b1a..dbb0df8d 100644 --- a/src/agoradatatools/etl/transform/distribution_data.py +++ b/src/agoradatatools/etl/transform/distribution_data.py @@ -48,10 +48,10 @@ def calculate_distribution(df: pd.DataFrame, col: str, is_scored, upper_bound) - obj["max"] = np.around(df[col].max(), 4) obj["mean"] = np.around(df[col].mean(), 4) obj["first_quartile"] = np.around( - df[col].quantile(q=0.25, interpolation="midpoint") + df[col].quantile(q=0.25, interpolation="midpoint"), 4 ) obj["third_quartile"] = np.around( - df[col].quantile(q=0.75, interpolation="midpoint") + df[col].quantile(q=0.75, interpolation="midpoint"), 4 ) return obj @@ -84,7 +84,7 @@ def transform_distribution_data( ) ) - overall_scores = overall_scores[interesting_columns + scored] + overall_scores = overall_scores[interesting_columns + scored].drop_duplicates() neo_matrix = {} for col in interesting_columns[1:]: # excludes the ENSG diff --git a/tests/test_assets/distribution_data/output/distribution_data_good_output_1.json b/tests/test_assets/distribution_data/output/distribution_data_good_output_1.json index 2d9a71b7..558aa7df 100644 --- a/tests/test_assets/distribution_data/output/distribution_data_good_output_1.json +++ b/tests/test_assets/distribution_data/output/distribution_data_good_output_1.json @@ -8,7 +8,7 @@ 5, 3, 1, - 4, + 3, 3, 1, 0 @@ -57,9 +57,9 @@ ], "min": 0.6101, "max": 4.0941, - "mean": 2.2715, - "first_quartile": 2.0, - "third_quartile": 3.0, + "mean": 2.23, + "first_quartile": 1.521, + "third_quartile": 3.1942, "name": "Target Risk Score", "syn_id": "syn25913473", "wiki_id": "621071" @@ -71,7 +71,7 @@ 6, 4, 0, - 6, + 5, 2, 2, 0, @@ -121,9 +121,9 @@ ], "min": 0.5012, "max": 2.7275, - "mean": 1.3151, - "first_quartile": 1.0, - "third_quartile": 2.0, + "mean": 1.2956, + "first_quartile": 0.6964, + "third_quartile": 1.735, "name": "Genetic Risk Score", "syn_id": "syn25913473", "wiki_id": "621069" @@ -137,7 +137,7 @@ 2, 4, 1, - 5, + 4, 1, 2 ], @@ -185,9 +185,9 @@ ], "min": 0.0, "max": 1.9765, - "mean": 0.9999, - "first_quartile": 1.0, - "third_quartile": 1.0, + "mean": 0.9789, + "first_quartile": 0.4328, + "third_quartile": 1.5113, "name": "Multi-omic Risk Score", "syn_id": "syn25913473", "wiki_id": "621070" diff --git a/tests/test_assets/distribution_data/output/distribution_data_good_output_2.json b/tests/test_assets/distribution_data/output/distribution_data_good_output_2.json index 1f304e3a..f47610f9 100644 --- a/tests/test_assets/distribution_data/output/distribution_data_good_output_2.json +++ b/tests/test_assets/distribution_data/output/distribution_data_good_output_2.json @@ -5,7 +5,7 @@ 1, 5, 7, - 4, + 3, 5, 1, 0, @@ -57,9 +57,9 @@ ], "min": 0.6101, "max": 4.0941, - "mean": 2.2715, - "first_quartile": 2.0, - "third_quartile": 3.0, + "mean": 2.23, + "first_quartile": 1.521, + "third_quartile": 3.1942, "name": "Target Risk Score", "syn_id": "syn25913473", "wiki_id": "621071" @@ -68,7 +68,7 @@ "distribution": [ 2, 10, - 6, + 5, 4, 1, 0, @@ -121,9 +121,9 @@ ], "min": 0.5012, "max": 2.7275, - "mean": 1.3151, - "first_quartile": 1.0, - "third_quartile": 2.0, + "mean": 1.2956, + "first_quartile": 0.6964, + "third_quartile": 1.735, "name": "Genetic Risk Score", "syn_id": "syn25913473", "wiki_id": "621069" @@ -132,7 +132,7 @@ "distribution": [ 6, 3, - 7, + 6, 6, 0, 0, @@ -185,9 +185,9 @@ ], "min": 0.0, "max": 1.9765, - "mean": 0.9999, - "first_quartile": 1.0, - "third_quartile": 1.0, + "mean": 0.9789, + "first_quartile": 0.4328, + "third_quartile": 1.5113, "name": "Multi-omic Risk Score", "syn_id": "syn25913473", "wiki_id": "621070" diff --git a/tests/test_assets/distribution_data/output/distribution_data_missing_output.json b/tests/test_assets/distribution_data/output/distribution_data_missing_output.json index 164b8c22..bf74113f 100644 --- a/tests/test_assets/distribution_data/output/distribution_data_missing_output.json +++ b/tests/test_assets/distribution_data/output/distribution_data_missing_output.json @@ -58,8 +58,8 @@ "min": 0.6101, "max": 4.0941, "mean": 2.3072, - "first_quartile": 2.0, - "third_quartile": 3.0, + "first_quartile": 1.6304, + "third_quartile": 3.1851, "name": "Target Risk Score", "syn_id": "syn25913473", "wiki_id": "621071" @@ -122,8 +122,8 @@ "min": 0.5012, "max": 2.7275, "mean": 1.2585, - "first_quartile": 1.0, - "third_quartile": 2.0, + "first_quartile": 0.6964, + "third_quartile": 1.735, "name": "Genetic Risk Score", "syn_id": "syn25913473", "wiki_id": "621069" @@ -186,8 +186,8 @@ "min": 0.0, "max": 1.9765, "mean": 1.0039, - "first_quartile": 1.0, - "third_quartile": 1.0, + "first_quartile": 0.6021, + "third_quartile": 1.4762, "name": "Multi-omic Risk Score", "syn_id": "syn25913473", "wiki_id": "621070" From 7ab0fc0ce439d91c81fc779def6fd08b6abefb60 Mon Sep 17 00:00:00 2001 From: Jaclyn Beck Date: Mon, 10 Jul 2023 14:45:13 -0700 Subject: [PATCH 6/6] Code cleanup: manual column renames in distribution data transform have been moved to column_rename in the config file. Tests have been updated to match --- config.yaml | 4 ++++ .../etl/transform/distribution_data.py | 14 +++++--------- test_config.yaml | 4 ++++ .../input/test_distribution_data_good_input.csv | 2 +- .../input/test_distribution_data_missing_input.csv | 2 +- ..._distribution_data_wrong_data_type_genetics.csv | 2 +- ...est_distribution_data_wrong_data_type_omics.csv | 2 +- ...t_distribution_data_wrong_data_type_overall.csv | 2 +- 8 files changed, 18 insertions(+), 14 deletions(-) diff --git a/config.yaml b/config.yaml index 895c637e..1bb0d7e4 100644 --- a/config.yaml +++ b/config.yaml @@ -225,6 +225,10 @@ omics_max_score: 2 provenance: - syn25575156.13 + column_rename: + overall: target_risk_score + geneticsscore: genetics_score + omicsscore: multi_omics_score destination: *dest - rna_distribution_data: diff --git a/src/agoradatatools/etl/transform/distribution_data.py b/src/agoradatatools/etl/transform/distribution_data.py index dbb0df8d..ac1f1b80 100644 --- a/src/agoradatatools/etl/transform/distribution_data.py +++ b/src/agoradatatools/etl/transform/distribution_data.py @@ -37,7 +37,7 @@ def calculate_distribution(df: pd.DataFrame, col: str, is_scored, upper_bound) - obj["distribution"][-1] -= 1 discard, obj["bins"] = list( - pd.cut(distribution, bins=10, precision=3, retbins=True) + pd.cut(distribution, bins=10, precision=3, include_lowest=True, right=True, retbins=True) ) obj["bins"] = np.around(obj["bins"].tolist()[1:], 2) base = [0, *obj["bins"][:-1]] @@ -66,15 +66,15 @@ def transform_distribution_data( overall_scores = datasets["overall_scores"] interesting_columns = [ "ensg", - "overall", - "geneticsscore", - "omicsscore", + "target_risk_score", + "genetics_score", + "multi_omics_score", ] # create mapping to deal with missing values as they take different shape across the fields scored = ["isscored_genetics", "isscored_omics"] mapping = dict(zip(interesting_columns[2:], scored)) - mapping["overall"] = None + mapping["target_risk_score"] = None # create mapping for max score values from config max_score = dict( @@ -92,10 +92,6 @@ def transform_distribution_data( overall_scores, col, mapping[col], max_score[col] ) - neo_matrix["target_risk_score"] = neo_matrix.pop("overall") - neo_matrix["genetics_score"] = neo_matrix.pop("geneticsscore") - neo_matrix["multi_omics_score"] = neo_matrix.pop("omicsscore") - additional_data = [ {"name": "Target Risk Score", "syn_id": "syn25913473", "wiki_id": "621071"}, {"name": "Genetic Risk Score", "syn_id": "syn25913473", "wiki_id": "621069"}, diff --git a/test_config.yaml b/test_config.yaml index b7394b69..a68c8b84 100644 --- a/test_config.yaml +++ b/test_config.yaml @@ -225,6 +225,10 @@ omics_max_score: 2 provenance: - syn25575156.13 + column_rename: + overall: target_risk_score + geneticsscore: genetics_score + omicsscore: multi_omics_score destination: *dest - rna_distribution_data: diff --git a/tests/test_assets/distribution_data/input/test_distribution_data_good_input.csv b/tests/test_assets/distribution_data/input/test_distribution_data_good_input.csv index 2dc0119e..e1fa4d27 100644 --- a/tests/test_assets/distribution_data/input/test_distribution_data_good_input.csv +++ b/tests/test_assets/distribution_data/input/test_distribution_data_good_input.csv @@ -1,4 +1,4 @@ -,hgnc_gene_id,ensg,overall,overall_rank,geneticsscore,omicsscore,literaturescore,neuropathscore,sm_druggability_bucket,safety_bucket,feasibility_bucket,abability_bucket,new_modality_bucket,tissue_engagement_bucket,pharos_class,classification,safety_bucket_definition,feasibility_bucket_definition,abability_bucket_definition,new_modality_bucket_definition,tissue_engagement_bucket_definition,isscored_genetics,isscored_omics,isscored_lit,isscored_neuropath +,hgnc_gene_id,ensg,target_risk_score,overall_rank,genetics_score,multi_omics_score,literaturescore,neuropathscore,sm_druggability_bucket,safety_bucket,feasibility_bucket,abability_bucket,new_modality_bucket,tissue_engagement_bucket,pharos_class,classification,safety_bucket_definition,feasibility_bucket_definition,abability_bucket_definition,new_modality_bucket_definition,tissue_engagement_bucket_definition,isscored_genetics,isscored_omics,isscored_lit,isscored_neuropath 378977_429,HIVEP2,ENSG00000010818,3.185141534,3832,1.744062244,1.441079289,1.260049115,0.200555635,7,3,4,6,4,2,Tdark,Potentially targetable by protein family structure: is a member of a gene family which has a protein member with a druggable pocket in the protein structure.,"Two or fewer of: high off-target gene expression, cancer driver, essential gene, associated deleterious genetic disorder, HPO phenotype associated gene, or black box warning on clinically used drug.","Medium target qualification, no critical safety issues. Identify and perform steps needed for additional TQ and move to category 1, 2 or 3 if successful, or park in category 5 or 6 if not.",Protein located in intracellular compartment.,Not specified suitable for degradation/inhibition by user.,"The target gene is not """"tissue enriched""""/""""tissue enhanced"""" in any tissue.",Y,Y,Y,Y 378978_429,HIVEP2,ENSG00000010818,3.185141534,3832,1.744062244,1.441079289,1.260049115,0.1,7,3,4,6,4,2,Tdark,Potentially targetable by protein family structure: is a member of a gene family which has a protein member with a druggable pocket in the protein structure.,"Two or fewer of: high off-target gene expression, cancer driver, essential gene, associated deleterious genetic disorder, HPO phenotype associated gene, or black box warning on clinically used drug.","Medium target qualification, no critical safety issues. Identify and perform steps needed for additional TQ and move to category 1, 2 or 3 if successful, or park in category 5 or 6 if not.",Protein located in intracellular compartment.,Not specified suitable for degradation/inhibition by user.,"The target gene is not """"tissue enriched""""/""""tissue enhanced"""" in any tissue.",Y,Y,Y,Y 389156_442,ATP1B3,ENSG00000069849,1.63040118,14056,1.63040118,0,0,0,1,5,4,3,4,2,Tclin,"Small molecule druggable: Protein with a small molecule ligand identified from ChEMBL, meeting TCRD activity criteria.",Clinical data with evidence of intolerable safety profile/adverse drug reactions in the desired modality and with target engagement. Drug for target withdrawn on those grounds.,"Medium target qualification, no critical safety issues. Identify and perform steps needed for additional TQ and move to category 1, 2 or 3 if successful, or park in category 5 or 6 if not.","Cell membrane-bound proteins. Highly accessible to antibody-based therapies, but potentially less so than secreted proteins or ECM components.",Not specified suitable for degradation/inhibition by user.,"The target gene is not """"tissue enriched""""/""""tissue enhanced"""" in any tissue.",Y,Y,N,N diff --git a/tests/test_assets/distribution_data/input/test_distribution_data_missing_input.csv b/tests/test_assets/distribution_data/input/test_distribution_data_missing_input.csv index 9af538c7..fa8f7617 100644 --- a/tests/test_assets/distribution_data/input/test_distribution_data_missing_input.csv +++ b/tests/test_assets/distribution_data/input/test_distribution_data_missing_input.csv @@ -1,4 +1,4 @@ -,hgnc_gene_id,ensg,overall,overall_rank,geneticsscore,omicsscore,literaturescore,neuropathscore,sm_druggability_bucket,safety_bucket,feasibility_bucket,abability_bucket,new_modality_bucket,tissue_engagement_bucket,pharos_class,classification,safety_bucket_definition,feasibility_bucket_definition,abability_bucket_definition,new_modality_bucket_definition,tissue_engagement_bucket_definition,isscored_genetics,isscored_omics,isscored_lit,isscored_neuropath +,hgnc_gene_id,ensg,target_risk_score,overall_rank,genetics_score,multi_omics_score,literaturescore,neuropathscore,sm_druggability_bucket,safety_bucket,feasibility_bucket,abability_bucket,new_modality_bucket,tissue_engagement_bucket,pharos_class,classification,safety_bucket_definition,feasibility_bucket_definition,abability_bucket_definition,new_modality_bucket_definition,tissue_engagement_bucket_definition,isscored_genetics,isscored_omics,isscored_lit,isscored_neuropath 378977_429,HIVEP2,ENSG00000010818,3.185141534,3832,1.744062244,1.441079289,1.260049115,0.200555635,7,3,4,6,4,2,Tdark,Potentially targetable by protein family structure: is a member of a gene family which has a protein member with a druggable pocket in the protein structure.,"Two or fewer of: high off-target gene expression, cancer driver, essential gene, associated deleterious genetic disorder, HPO phenotype associated gene, or black box warning on clinically used drug.","Medium target qualification, no critical safety issues. Identify and perform steps needed for additional TQ and move to category 1, 2 or 3 if successful, or park in category 5 or 6 if not.",Protein located in intracellular compartment.,Not specified suitable for degradation/inhibition by user.,"The target gene is not """"tissue enriched""""/""""tissue enhanced"""" in any tissue.",,Y,Y,Y 378978_429,HIVEP2,ENSG00000010818,3.185141534,3832,1.744062244,1.441079289,1.260049115,0.1,7,3,4,6,4,2,Tdark,Potentially targetable by protein family structure: is a member of a gene family which has a protein member with a druggable pocket in the protein structure.,"Two or fewer of: high off-target gene expression, cancer driver, essential gene, associated deleterious genetic disorder, HPO phenotype associated gene, or black box warning on clinically used drug.","Medium target qualification, no critical safety issues. Identify and perform steps needed for additional TQ and move to category 1, 2 or 3 if successful, or park in category 5 or 6 if not.",Protein located in intracellular compartment.,Not specified suitable for degradation/inhibition by user.,"The target gene is not """"tissue enriched""""/""""tissue enhanced"""" in any tissue.",Y,,Y,Y 389156_442,ATP1B3,ENSG00000069849,1.63040118,14056,1.63040118,0,0,0,1,5,4,3,4,2,Tclin,"Small molecule druggable: Protein with a small molecule ligand identified from ChEMBL, meeting TCRD activity criteria.",Clinical data with evidence of intolerable safety profile/adverse drug reactions in the desired modality and with target engagement. Drug for target withdrawn on those grounds.,"Medium target qualification, no critical safety issues. Identify and perform steps needed for additional TQ and move to category 1, 2 or 3 if successful, or park in category 5 or 6 if not.","Cell membrane-bound proteins. Highly accessible to antibody-based therapies, but potentially less so than secreted proteins or ECM components.",Not specified suitable for degradation/inhibition by user.,"The target gene is not """"tissue enriched""""/""""tissue enhanced"""" in any tissue.",Y,Y,,N diff --git a/tests/test_assets/distribution_data/input/test_distribution_data_wrong_data_type_genetics.csv b/tests/test_assets/distribution_data/input/test_distribution_data_wrong_data_type_genetics.csv index dbe5a717..7a1cae47 100644 --- a/tests/test_assets/distribution_data/input/test_distribution_data_wrong_data_type_genetics.csv +++ b/tests/test_assets/distribution_data/input/test_distribution_data_wrong_data_type_genetics.csv @@ -1,4 +1,4 @@ -,hgnc_gene_id,ensg,overall,overall_rank,geneticsscore,omicsscore,literaturescore,neuropathscore,sm_druggability_bucket,safety_bucket,feasibility_bucket,abability_bucket,new_modality_bucket,tissue_engagement_bucket,pharos_class,classification,safety_bucket_definition,feasibility_bucket_definition,abability_bucket_definition,new_modality_bucket_definition,tissue_engagement_bucket_definition,isscored_genetics,isscored_omics,isscored_lit,isscored_neuropath +,hgnc_gene_id,ensg,target_risk_score,overall_rank,genetics_score,multi_omics_score,literaturescore,neuropathscore,sm_druggability_bucket,safety_bucket,feasibility_bucket,abability_bucket,new_modality_bucket,tissue_engagement_bucket,pharos_class,classification,safety_bucket_definition,feasibility_bucket_definition,abability_bucket_definition,new_modality_bucket_definition,tissue_engagement_bucket_definition,isscored_genetics,isscored_omics,isscored_lit,isscored_neuropath 378977_429,HIVEP2,ENSG00000010818,3.18514153,3832,1.74406224,1.44107929,1.26004912,0.20055564,7,3,4,6,4,2,Tdark,Potentially targetable by protein family structure: is a member of a gene family which has a protein member with a druggable pocket in the protein structure.,"Two or fewer of: high off-target gene expression, cancer driver, essential gene, associated deleterious genetic disorder, HPO phenotype associated gene, or black box warning on clinically used drug.","Medium target qualification, no critical safety issues. Identify and perform steps needed for additional TQ and move to category 1, 2 or 3 if successful, or park in category 5 or 6 if not.",Protein located in intracellular compartment.,Not specified suitable for degradation/inhibition by user.,"The target gene is not """"tissue enriched""""/""""tissue enhanced"""" in any tissue.",Y,Y,Y,Y 378978_429,HIVEP2,ENSG00000010818,3.18514153,3832,1.74406224,1.44107929,1.26004912,0.1,7,3,4,6,4,2,Tdark,Potentially targetable by protein family structure: is a member of a gene family which has a protein member with a druggable pocket in the protein structure.,"Two or fewer of: high off-target gene expression, cancer driver, essential gene, associated deleterious genetic disorder, HPO phenotype associated gene, or black box warning on clinically used drug.","Medium target qualification, no critical safety issues. Identify and perform steps needed for additional TQ and move to category 1, 2 or 3 if successful, or park in category 5 or 6 if not.",Protein located in intracellular compartment.,Not specified suitable for degradation/inhibition by user.,"The target gene is not """"tissue enriched""""/""""tissue enhanced"""" in any tissue.",Y,Y,Y,Y 389156_442,ATP1B3,ENSG00000069849,1.63040118,14056,blank,0,0,0,1,5,4,3,4,2,Tclin,"Small molecule druggable: Protein with a small molecule ligand identified from ChEMBL, meeting TCRD activity criteria.",Clinical data with evidence of intolerable safety profile/adverse drug reactions in the desired modality and with target engagement. Drug for target withdrawn on those grounds.,"Medium target qualification, no critical safety issues. Identify and perform steps needed for additional TQ and move to category 1, 2 or 3 if successful, or park in category 5 or 6 if not.","Cell membrane-bound proteins. Highly accessible to antibody-based therapies, but potentially less so than secreted proteins or ECM components.",Not specified suitable for degradation/inhibition by user.,"The target gene is not """"tissue enriched""""/""""tissue enhanced"""" in any tissue.",Y,Y,N,N diff --git a/tests/test_assets/distribution_data/input/test_distribution_data_wrong_data_type_omics.csv b/tests/test_assets/distribution_data/input/test_distribution_data_wrong_data_type_omics.csv index 02eb24e7..481f2261 100644 --- a/tests/test_assets/distribution_data/input/test_distribution_data_wrong_data_type_omics.csv +++ b/tests/test_assets/distribution_data/input/test_distribution_data_wrong_data_type_omics.csv @@ -1,4 +1,4 @@ -,hgnc_gene_id,ensg,overall,overall_rank,geneticsscore,omicsscore,literaturescore,neuropathscore,sm_druggability_bucket,safety_bucket,feasibility_bucket,abability_bucket,new_modality_bucket,tissue_engagement_bucket,pharos_class,classification,safety_bucket_definition,feasibility_bucket_definition,abability_bucket_definition,new_modality_bucket_definition,tissue_engagement_bucket_definition,isscored_genetics,isscored_omics,isscored_lit,isscored_neuropath +,hgnc_gene_id,ensg,target_risk_score,overall_rank,genetics_score,multi_omics_score,literaturescore,neuropathscore,sm_druggability_bucket,safety_bucket,feasibility_bucket,abability_bucket,new_modality_bucket,tissue_engagement_bucket,pharos_class,classification,safety_bucket_definition,feasibility_bucket_definition,abability_bucket_definition,new_modality_bucket_definition,tissue_engagement_bucket_definition,isscored_genetics,isscored_omics,isscored_lit,isscored_neuropath 378977_429,HIVEP2,ENSG00000010818,3.18514153,3832,1.74406224,1.44107929,1.26004912,0.20055564,7,3,4,6,4,2,Tdark,Potentially targetable by protein family structure: is a member of a gene family which has a protein member with a druggable pocket in the protein structure.,"Two or fewer of: high off-target gene expression, cancer driver, essential gene, associated deleterious genetic disorder, HPO phenotype associated gene, or black box warning on clinically used drug.","Medium target qualification, no critical safety issues. Identify and perform steps needed for additional TQ and move to category 1, 2 or 3 if successful, or park in category 5 or 6 if not.",Protein located in intracellular compartment.,Not specified suitable for degradation/inhibition by user.,"The target gene is not """"tissue enriched""""/""""tissue enhanced"""" in any tissue.",Y,Y,Y,Y 378978_429,HIVEP2,ENSG00000010818,3.18514153,3832,1.74406224,1.44107929,1.26004912,0.1,7,3,4,6,4,2,Tdark,Potentially targetable by protein family structure: is a member of a gene family which has a protein member with a druggable pocket in the protein structure.,"Two or fewer of: high off-target gene expression, cancer driver, essential gene, associated deleterious genetic disorder, HPO phenotype associated gene, or black box warning on clinically used drug.","Medium target qualification, no critical safety issues. Identify and perform steps needed for additional TQ and move to category 1, 2 or 3 if successful, or park in category 5 or 6 if not.",Protein located in intracellular compartment.,Not specified suitable for degradation/inhibition by user.,"The target gene is not """"tissue enriched""""/""""tissue enhanced"""" in any tissue.",Y,Y,Y,Y 389156_442,ATP1B3,ENSG00000069849,1.63040118,14056,1.63040118,0,0,0,1,5,4,3,4,2,Tclin,"Small molecule druggable: Protein with a small molecule ligand identified from ChEMBL, meeting TCRD activity criteria.",Clinical data with evidence of intolerable safety profile/adverse drug reactions in the desired modality and with target engagement. Drug for target withdrawn on those grounds.,"Medium target qualification, no critical safety issues. Identify and perform steps needed for additional TQ and move to category 1, 2 or 3 if successful, or park in category 5 or 6 if not.","Cell membrane-bound proteins. Highly accessible to antibody-based therapies, but potentially less so than secreted proteins or ECM components.",Not specified suitable for degradation/inhibition by user.,"The target gene is not """"tissue enriched""""/""""tissue enhanced"""" in any tissue.",Y,Y,N,N diff --git a/tests/test_assets/distribution_data/input/test_distribution_data_wrong_data_type_overall.csv b/tests/test_assets/distribution_data/input/test_distribution_data_wrong_data_type_overall.csv index 350be113..1138493e 100644 --- a/tests/test_assets/distribution_data/input/test_distribution_data_wrong_data_type_overall.csv +++ b/tests/test_assets/distribution_data/input/test_distribution_data_wrong_data_type_overall.csv @@ -1,4 +1,4 @@ -,hgnc_gene_id,ensg,overall,overall_rank,geneticsscore,omicsscore,literaturescore,neuropathscore,sm_druggability_bucket,safety_bucket,feasibility_bucket,abability_bucket,new_modality_bucket,tissue_engagement_bucket,pharos_class,classification,safety_bucket_definition,feasibility_bucket_definition,abability_bucket_definition,new_modality_bucket_definition,tissue_engagement_bucket_definition,isscored_genetics,isscored_omics,isscored_lit,isscored_neuropath +,hgnc_gene_id,ensg,target_risk_score,overall_rank,genetics_score,multi_omics_score,literaturescore,neuropathscore,sm_druggability_bucket,safety_bucket,feasibility_bucket,abability_bucket,new_modality_bucket,tissue_engagement_bucket,pharos_class,classification,safety_bucket_definition,feasibility_bucket_definition,abability_bucket_definition,new_modality_bucket_definition,tissue_engagement_bucket_definition,isscored_genetics,isscored_omics,isscored_lit,isscored_neuropath 378977_429,HIVEP2,ENSG00000010818,3.185141534,3832,1.744062244,1.441079289,1.260049115,0.200555635,7,3,4,6,4,2,Tdark,Potentially targetable by protein family structure: is a member of a gene family which has a protein member with a druggable pocket in the protein structure.,"Two or fewer of: high off-target gene expression, cancer driver, essential gene, associated deleterious genetic disorder, HPO phenotype associated gene, or black box warning on clinically used drug.","Medium target qualification, no critical safety issues. Identify and perform steps needed for additional TQ and move to category 1, 2 or 3 if successful, or park in category 5 or 6 if not.",Protein located in intracellular compartment.,Not specified suitable for degradation/inhibition by user.,"The target gene is not """"tissue enriched""""/""""tissue enhanced"""" in any tissue.",Y,Y,Y,Y 378978_429,HIVEP2,ENSG00000010818,3.185141534,3832,1.744062244,1.441079289,1.260049115,0.1,7,3,4,6,4,2,Tdark,Potentially targetable by protein family structure: is a member of a gene family which has a protein member with a druggable pocket in the protein structure.,"Two or fewer of: high off-target gene expression, cancer driver, essential gene, associated deleterious genetic disorder, HPO phenotype associated gene, or black box warning on clinically used drug.","Medium target qualification, no critical safety issues. Identify and perform steps needed for additional TQ and move to category 1, 2 or 3 if successful, or park in category 5 or 6 if not.",Protein located in intracellular compartment.,Not specified suitable for degradation/inhibition by user.,"The target gene is not """"tissue enriched""""/""""tissue enhanced"""" in any tissue.",Y,Y,Y,Y 389156_442,ATP1B3,ENSG00000069849,blank,14056,1.63040118,0,0,0,1,5,4,3,4,2,Tclin,"Small molecule druggable: Protein with a small molecule ligand identified from ChEMBL, meeting TCRD activity criteria.",Clinical data with evidence of intolerable safety profile/adverse drug reactions in the desired modality and with target engagement. Drug for target withdrawn on those grounds.,"Medium target qualification, no critical safety issues. Identify and perform steps needed for additional TQ and move to category 1, 2 or 3 if successful, or park in category 5 or 6 if not.","Cell membrane-bound proteins. Highly accessible to antibody-based therapies, but potentially less so than secreted proteins or ECM components.",Not specified suitable for degradation/inhibition by user.,"The target gene is not """"tissue enriched""""/""""tissue enhanced"""" in any tissue.",Y,Y,N,N