From e21faa2b32ab880daa89e37b05b8e3f50063190b Mon Sep 17 00:00:00 2001 From: Jaclyn Beck Date: Fri, 2 Feb 2024 17:15:34 -0800 Subject: [PATCH 01/13] Added gene_info test assets for the 'passing' scenario and wrote test code to test the passing scenario --- .../input/diff_exp_data_good_input.csv | 16 + .../input/druggability_good_input.csv | 6 + .../gene_info/input/eqtl_good_input.csv | 6 + .../input/gene_metadata_good_input.feather | Bin 0 -> 13202 bytes .../input/gene_metadata_unaltered.feather | Bin 0 -> 13154 bytes .../input/genes_biodomains_good_input.csv | 6 + .../gene_info/input/igap_good_input.csv | 6 + .../input/median_expression_good_input.csv | 11 + .../gene_info/input/proteomics_good_input.csv | 17 + .../input/proteomics_srm_good_input.csv | 7 + .../input/proteomics_tmt_good_input.csv | 7 + .../input/target_list_good_input.csv | 6 + .../input/tep_adi_info_good_input.csv | 6 + .../output/gene_info_good_output_1.json | 825 ++++++++++++++++++ .../output/gene_info_good_output_2.json | 825 ++++++++++++++++++ tests/transform/test_gene_info.py | 131 +++ 16 files changed, 1875 insertions(+) create mode 100644 tests/test_assets/gene_info/input/diff_exp_data_good_input.csv create mode 100644 tests/test_assets/gene_info/input/druggability_good_input.csv create mode 100644 tests/test_assets/gene_info/input/eqtl_good_input.csv create mode 100644 tests/test_assets/gene_info/input/gene_metadata_good_input.feather create mode 100644 tests/test_assets/gene_info/input/gene_metadata_unaltered.feather create mode 100644 tests/test_assets/gene_info/input/genes_biodomains_good_input.csv create mode 100644 tests/test_assets/gene_info/input/igap_good_input.csv create mode 100644 tests/test_assets/gene_info/input/median_expression_good_input.csv create mode 100644 tests/test_assets/gene_info/input/proteomics_good_input.csv create mode 100644 tests/test_assets/gene_info/input/proteomics_srm_good_input.csv create mode 100644 tests/test_assets/gene_info/input/proteomics_tmt_good_input.csv create mode 100644 tests/test_assets/gene_info/input/target_list_good_input.csv create mode 100644 tests/test_assets/gene_info/input/tep_adi_info_good_input.csv create mode 100644 tests/test_assets/gene_info/output/gene_info_good_output_1.json create mode 100644 tests/test_assets/gene_info/output/gene_info_good_output_2.json create mode 100644 tests/transform/test_gene_info.py diff --git a/tests/test_assets/gene_info/input/diff_exp_data_good_input.csv b/tests/test_assets/gene_info/input/diff_exp_data_good_input.csv new file mode 100644 index 00000000..b4bcdbf9 --- /dev/null +++ b/tests/test_assets/gene_info/input/diff_exp_data_good_input.csv @@ -0,0 +1,16 @@ +model,tissue,comparison,ensembl_gene_id,logfc,ci_l,ci_r,aveexpr,t,p_value,adj_p_val,gene_biotype,chromosome_name,direction,hgnc_symbol,percentage_gc_content,gene_length,sex,study +Diagnosis,TCX,AD-CONTROL,ENSG00000001626,-0.38386144170829,-0.610418505477203,-0.157304377939378,1.0964718993346,-3.32039574535246,0.0009658465939259,0.0028700749497231,protein_coding,7,DOWN,CFTR,36.5864869618047,250188,ALL,MAYO +Diagnosis,TCX,AD-CONTROL,ENSG00000001631,0.0960797874215311,0.0277369852074512,0.164422589635611,3.66470161931573,2.79371699517244,0.0054146957075407,0.0128646180744981,protein_coding,7,NONE,KRIT1,36.4761218695707,47198,ALL,MAYO +,TCX,AD-CONTROL,ENSG00000001629,0.0688865504482834,-0.0336046266228221,0.171377727519389,6.83925693988808,1.3190719562984,0.187765592003833,0.266881156254369,protein_coding,7,NONE,ANKIB1,36.8283894215301,155410,ALL,MAYO +Diagnosis,,AD-CONTROL,ENSG00000001460,-0.0428271747433806,-0.134111094837718,0.0484567453509564,4.53063964529137,-0.922749954447199,0.356592428630043,0.451948815550479,protein_coding,1,NONE,STPG1,44.0903630539242,59936,ALL,MAYO +Diagnosis,TCX,,ENSG00000000419,-0.0106100593393075,-0.0731334456790523,0.0519133270004373,4.97936316930159,-0.335784216826404,0.737176767177833,0.798556808985985,protein_coding,20,NONE,DPM1,39.8497192789902,23689,ALL,MAYO +Diagnosis,IFG,AD-CONTROL,ENSG00000000419,-0.0878179824082075,-0.182373477784594,0.0067375129681794,4.32089559628026,-1.82192928105604,0.0688901841106443,0.155209255305467,,20,NONE,DPM1,39.8497192789902,23689,ALL,MSSM +Diagnosis,IFG,AD-CONTROL,ENSG00000001629,0.0804858875345706,0.0073127133165788,0.153659061752562,7.93343801331035,2.16036740054897,0.0310645962980247,0.0847168839428919,protein_coding,,NONE,ANKIB1,36.8283894215301,155410,ALL,MSSM +Diagnosis,IFG,AD-CONTROL,ENSG00000001460,-0.0342369800592941,-0.114230375069546,0.0457564149509574,3.8133273490137,-0.842377670501157,0.399870699901111,0.554357331277541,protein_coding,1,,STPG1,44.0903630539242,59936,,MSSM +Diagnosis,IFG,AD-CONTROL,ENSG00000001631,-0.0286960913417103,-0.0945307456366793,0.0371385629532588,4.11960673387962,-0.856820989104849,0.391728896583518,0.546803082969529,protein_coding,7,NONE,KRIT1,36.4761218695707,47198,ALL, +Diagnosis,IFG,AD-CONTROL,ENSG00000001626,0.0258715927543007,-0.1276221394235,0.179365324932101,3.0102152746566,0.330569495815668,0.741062726962072,0.834121516755035,protein_coding,7,NONE,,36.5864869618047,250188,ALL,MSSM +Diagnosis,DLPFC,AD-CONTROL,ENSG00000001626,-0.14255892208891,-0.34294790004963,0.0578300558718095,-0.301663671232398,-1.39443750667908,0.163313064777812,0.296117785659528,protein_coding,7,NONE,CFTR,36.5864869618047,250188,ALL,ROSMAP +Diagnosis,DLPFC,AD-CONTROL,ENSG00000000419,-0.0882919008770653,-0.131304264580892,-0.0452795371732381,3.92713408775626,-4.02835272367389,5.78948411033998e-05,0.0006377563132996,protein_coding,20,NONE,DPM1,39.8497192789902,23689,ALL,ROSMAP +Diagnosis,DLPFC,AD-CONTROL,ENSG00000001629,0.0841212438799088,0.0354911663215749,0.132751321438243,6.65482116592828,3.39203333183632,0.0007048884218731,0.0045744978486257,protein_coding,7,NONE,ANKIB1,36.8283894215301,155410,ALL,ROSMAP +Diagnosis,DLPFC,AD-CONTROL,ENSG00000001460,-0.0398165705036053,-0.0915728594627508,0.0119397184555402,4.01629214749823,-1.50874762983754,0.131493224324285,0.252297356550177,protein_coding,1,NONE,STPG1,44.0903630539242,59936,ALL,ROSMAP +Diagnosis,DLPFC,AD-CONTROL,ENSG00000001631,-0.0281454867160997,-0.0700675860372825,0.013776612605083,2.93709783776462,-1.31934379342664,0.187178523092241,0.327003220657394,protein_coding,7,NONE,KRIT1,36.4761218695707,47198,ALL,ROSMAP diff --git a/tests/test_assets/gene_info/input/druggability_good_input.csv b/tests/test_assets/gene_info/input/druggability_good_input.csv new file mode 100644 index 00000000..5c9269d7 --- /dev/null +++ b/tests/test_assets/gene_info/input/druggability_good_input.csv @@ -0,0 +1,6 @@ +ensembl_gene_id,sm_druggability_bucket,safety_bucket,feasibility_bucket,abability_bucket,new_modality_bucket,tissue_engagement_bucket,pharos_class,classification,safety_bucket_definition,feasibility_bucket_definition,abability_bucket_definition,new_modality_bucket_definition,tissue_engagement_bucket_definition +ENSG00000000005,7,4,4,3,4,4,Tbio,,"More than two of: high off target gene expression, cancer driver, essential gene, associated deleterious genetic disorder, HPO phenotype associated gene, or black box warning on clinically used drug.","Medium target qualification, no critical safety issues. Identify and perform steps needed for additional TQ and move to category 1, 2 or 3 if successful, or park in category 5 or 6 if not.","Cell membrane-bound proteins. Highly accessible to antibody-based therapies, but potentially less so than secreted proteins or ECM components.",Not specified suitable for degradation/inhibition by user.,The target gene is enriched/enhanced in tissues marked by the user as being associated with a low risk of off-target engagement issues. +ENSG00000001036,1,3,5,1,4,2,Tchem,"Small molecule druggable: Protein with a small molecule ligand identified from ChEMBL, meeting TCRD activity criteria.",,Medium or high target qualification. Interesting target but has gaps in TQ or tractability/druggability which will make progress difficult or slow. Park and watch for developments e.g. until additional TQ or tools emerge. ,Secreted protein. Highly accessible to antibody-based therapies.,Not specified suitable for degradation/inhibition by user.,"The target gene is not ""tissue enriched""/""tissue enhanced"" in any tissue." +ENSG00000000460,13,3,5,3,4,4,Tdark,Unknown: There is no information on ligands or structure in any of the categories above. ,"Two or fewer of: high off-target gene expression, cancer driver, essential gene, associated deleterious genetic disorder, HPO phenotype associated gene, or black box warning on clinically used drug.",,,Not specified suitable for degradation/inhibition by user.,The target gene is enriched/enhanced in tissues marked by the user as being associated with a low risk of off-target engagement issues. +ENSG00000000971,3,4,3,1,4,5,Tbio,"Targetable by structure: Structurally druggable protein, based on the presence of a druggable pocket in the protein (DrugEBIlity/CanSAR).","More than two of: high off target gene expression, cancer driver, essential gene, associated deleterious genetic disorder, HPO phenotype associated gene, or black box warning on clinically used drug.","High target qualification but low feasibility – Tier C for drug development. Attempt to address gaps and re-evaluate if additional feasibility tools found. Either move to category 1 or 2 if successful, or park in category 5 if not.",Secreted protein. Highly accessible to antibody-based therapies.,,The target gene is enriched/enhanced in tissues marked by the user as being associated with a high risk of off-target engagement issues. +ENSG00000001084,1,5,3,3,4,2,,"Small molecule druggable: Protein with a small molecule ligand identified from ChEMBL, meeting TCRD activity criteria.",Clinical data with evidence of intolerable safety profile/adverse drug reactions in the desired modality and with target engagement. Drug for target withdrawn on those grounds.,"High target qualification but low feasibility – Tier C for drug development. Attempt to address gaps and re-evaluate if additional feasibility tools found. Either move to category 1 or 2 if successful, or park in category 5 if not.","Cell membrane-bound proteins. Highly accessible to antibody-based therapies, but potentially less so than secreted proteins or ECM components.",Not specified suitable for degradation/inhibition by user., diff --git a/tests/test_assets/gene_info/input/eqtl_good_input.csv b/tests/test_assets/gene_info/input/eqtl_good_input.csv new file mode 100644 index 00000000..9e49ca8f --- /dev/null +++ b/tests/test_assets/gene_info/input/eqtl_good_input.csv @@ -0,0 +1,6 @@ +ensembl_gene_id,is_eqtl +ENSG00000000419,True +ENSG00000000971,True +ENSG00000001460,True +ENSG00000001626,True +ENSG00000161149,False diff --git a/tests/test_assets/gene_info/input/gene_metadata_good_input.feather b/tests/test_assets/gene_info/input/gene_metadata_good_input.feather new file mode 100644 index 0000000000000000000000000000000000000000..b7f8a1ce519148fb0e3ee8a406d64e992b7560d8 GIT binary patch literal 13202 zcmeHudwd(!o$v3=Ll0ZFWXE=#1d@q)#0JaO$d5Rb5XIKrw0&~O6k9o z`8%=!p6f(L z=t#)sO%pN4)od5hNb%ju;Waw{uJd>Gl5NPCukie0?Ru%+cUCiF=w|Btcc$gEY)W&^ zcMCb*UQ<46B(2nZ7xI~0;c$jEeSw~IdDk+!tEDtoL-C87^el=S*);!l)8t^dr|FUf zoq1<3P}tEk)8_eUJ*%H@qdSv)t%_XG)-~6%i_FP1rfp6k;7z%L zX4}?I-jyA??HE>;r>dQ8k+x_PSVVd6FU)LB0otC(^0V{b1nLm)1j_QH74OFbhrfHb z{MfuW> zqyLR^t@GuIV>k|Z)I67%IvCW7l@!+?9kJ`pW<2 z1FAaY6A87=hX6IAR<2nDiZzoc5udZ`R+I|h5kN`^HI&OVg7$bBUPrkeUb@`Qf=_w! zJHqx-ELVE|bvYOG%fr(72Pvia1*Q0&QhYhu=HVWUfroPW{iS%hd~a#M>G@)GdnsNn z|4*fOxj#LeFc8}3A2eHvm!gN`<@)v?m0ym(S^8iQokFy5{6K2zo!&0_xJWv zysn8W@`xiZE*I3euWRbxn|3x>g6NxjTaRnvR|zty--HX;_VP zbR)T^6F+MAxwKS0m)5AA-Dgru=bA7r?OIMP-75cy?!wp6pHh$7-SH50GS#XWyM0|^?n8tQp$vJvHWx?X% zMLPzEhsH)!#mKt&HtE8YP;5P&H@Og3tQka-R%RN7UC3+&#)^_LT+1<>!J%;_XIrin zwqS#BZriF%*hytZ%NS-saSJv|Wp!n~A`!s!Q1|3ANwwz2g$_yP`=H#H_D6VFhN@6H% zV`|Xd9P|$*X=Ra|<)cx@h7++6n@w36q#RG<9V4*`^?2H68`G9!WpriQ_%(H4L^~-4 z6U(}mZn0b=g^9utu3>$cruG0$!_3kh_ZoJPb;63$umZM`#r(wwe@weGu&b~D`fmC} zDxY+vb?XOUOqO&V<1yj1zY3xildAGz+IwUe^K?-r%?0#&=*@B>io6@W7I_$F1f0iK0kuFqzy&ksdn*CX zC{FdSR&CmF)$kA6FE5^8#{BjPmOO$bg8lVX(-lT9^%4r z|Al>uY1qxbs|L8_I0fLWlRFY~L4|o9hXW%EmlfW~;^8oMi ze*k>04+4J%M27wXA)pp$1Xcnk0?j}ehyq=}8sKm!M@s9$%gt5yjSt8BJLBs|)se}8 z(a}WzM7)1!cpwpr4}?_@z0#l#jtnP82ZrM@HLhCrjH*UsIm@)vWOycRPmIT-xvs7N zC30Oo;XyCec6q6$r-9}?$*^N5qp1zi$XK+kYiJ}s0s{BaoERU9PsIBM!+m3e;VrTC zu|%|*YGOLgHAW-qXe_ZY+P|T*JGwF2(H5N;iFd7zb&VxrBON1}qlP(?_Ct=QHgvA; z>Ra6w9UVVsV)LX(mzDR5Op);L*#Vjhof+$k$5)SzCsy;_(~g(ahSlAXcvqV`IW``y zr~N3<(G?%$tY!qGonv9Ovww7?e>9FghK)8N(;mFUM&t3(za$s`_!ssYmy5sTi!dNy z^GHNXJRni^1#p}&=--$Hm$q51K|&^gpD?>Y27j0!h>|bqucq6V(L3|)A>5h`5i8J zGibudWb&BFjE=7%W3Q!Gt1+DrdIiNXF4Ds($W^#avu;u0&~0qpM~GRB{hW4V_OdA> zrFAoUg!N+N%4JNtm+j4HN$3%_lG1yq#{Lz}EpNrbSot(&484SA;X>eNw=1)nqZD+^ zoTXsuQhEln?b>h9+{!j(EBNXTNaK_;T~NmLnF;+u=oxwr4%o<=ZEgKUjmRo%M4#pi zXHK@D*)$_7oCuF<*%k%E-k}N9DkM@?*0MuPXrcWD6XXG1BoE55E)otv`^dwOgZV&~tZOVw2+}=#`g`7*OdDGp(Uad&!GZ>SxgKJvzQzWi(+D7Q0AIt9)vLFpU$0=n7oj68XA=}X}%wsowhuCk#n)D|{_A7QZs*1KL zqj^^qSRbo*Tcov`tv86Th)_?k{|soDSWd^XQ~L9a9THWjqeCP_#IFtOZA#2UyIE}y z+m(az3XSY;c6kO9X5>u$5gO;JoqMym4*s)G3DQkt)()(!+O@3>rua)j5Zpw3P~1{f zBI~gNGgjVK)(^xc=QX`Foj5t$M{=!j^I;X%*+BGN))04?sIxtM`&|T0t+gO^4vdc` z1nxZetT}IxS~1@wSpm=O|+_ zb{2M}?rfD)nF2PkyGEczY%-^r8Kai4eKiGN*-Y5N)QR=c1475d`i@3g}`^VgeliH(rNt(NlZzc1!>{vuc$wb$@fVtDXvWAU!w<G6u)%^ia6*<m zicNaz67L`=ru80yCaBBNGN-XbEBMHchpAW3Dn`9`9hms_9uHQ_yxy~-=Mf>$IDRI} z#jOnJE@OE^u@;Z;HEFzgJRWN~*Lz{o%!3u$I;%d_&sv37WOpaH9k?HMY|)-@;FLRs z(YTaSnkVApEjI|XSg822FX8A(TfbC}!GyAi?VC0&D+TtYYZ>TCH+lD9H-ZJAve$eO ztiZDZ3sbHAV-lb6G$bFVdhKyy*9!?uUef)(z`no|Mi%DQi{9QDH2K%QTyj>+8cs&` zO@UL}*7PjZ8C62ad6Wg~Jc8#&7Kr#Cn8l!qhF&;t|H-$~DfCtR!4EFDe$zw?WJ?P2 z&$e*5q|6p_)^~}u(NtE?A?-S$p%9RLOFSzL>v0badRFKg1U8j2W?)S|!=yJQ--r-F zANI?NZa?h}ZuLl!jY+poZd5l4UO}MG_+FDW$gy`=c(9t5RbrDGzI!lqM^Pi;Xyl|0 znGP`U`F1ZoEV3{9J*SayA3N1>*Gj=FWssmZhYmtan@@R8h_1Oo7S>DJybWthC(P=r zC{&Ou#gCKlE$UTUyTft?{oDuT5_01L%gDZ3NnfE}u-N>q-Z=_GWGQ1|Fk-i0rp;4H z(J}{Rag5j}d|Y9@g-vBF10wb(e$np}J+kzrO7V5yD#Q5~VIP-c>mjp~*7UV(CyYT8 z2H(#l)lz2P_N`27j|s#o)ue9sdxKATUlO`4mNgwIhhEQR)h+MrTE!snaCn;^}hOc{_V zIbZPeEJ&NA15Vya>Nyw6k_oM3jo@SoU^{l|U!;|q{hDV*+rI_`@ij3Nc}RVZ)5Co zFZ)deeP8b7&V%!+>3;=d@Mfe=^-aX%vDoC4j~?{V6D*Xo8GY8H!uN5Hj}BBaL2lCB zRSf%9hq^Y=Ei}ZwUMc>oPerT!LMU|?F;E&5jnsogQ&iJ8E7Zj|3&Jx#y1r7$rawy3 zZ>aLi6zw029al-er8>nIlB$B6w3!(r`>Z5?G$5@a_IV%ek=cHdCM6-x*bn^u87-xE zR@KFBpvzjZuwbap4J4jjS*yJ&R6i}ybG`&5RmynY7ixV|sIebnA^jKC6s)=mq7Da5 zZulA&(!Nm5R*LjR#vHG7LMQ?y>J0zxRz)|UY}hxF_&bU6!KkgJI!UqYgDmD^&rLCU zyh>=OI;%|?%-g4tZyH1Mi>p0CPVk=Q7hd$xC?*>QrH7uDp)s*2C}rnH=ubFGJ@ zDReQZJOAVhrP3@ICf3LOOzB#>L!?~+w!AXQojtntV{x(JQq?xv-KHe4(T(KaNy3jq z(j8K>k^L}<(Mq%EILe>y>5ZADaju8%W)W!8&q}#*-GTa$yt(rFSk8u;5shxUfD%yo zrp;T0P|tw$B23<*hP$-kGH+*f`>~ph+`M^}T5p;*oaU}oVGz=L!QMmy2x@v`7aVu-z5 zIRFotHj6_I7Rlx9+k7$Z47)?1Z56_!-kRw|t$3E`U&1!&yI??^Q_-aFy3R{uBwZ;4 z|41jWdjl+8CvEl#->>#GhJLlcA2vKZd{qTK&*T>Z!bfW9L_s`RraR=7n)`z6`=fBR zwhLawblymPTnwc=OiL?SJ#T+X4y88MD^~8SRrIn?x|%$9)QDdSk|FuDP-H^STwF(i zMYN()_*i|yN}Ja76Afo65VW~Jcp9~z)l5ulj*CZ|1FvD8! zYhb^t5O##9s*WDSw`WLGF)D!Z4A3nbe>cxUeIeVWqd6?j=#a9R5Ei z4^qFXmTq8Df;>K}yB)I-{Mi7qy?s!tVvCj>ffV=<7fCQ>-GCJITlPH40aH`@rbp ze?`LURl+yDCm7D+M&T`QBw0vmarKuUm`rl*d|&zW8dBoa)+cu zGMTsSyICl6b`?!CT3a93Ls$3frmGEhMSA8!(k-P4_j%~U_pK6W998eBLj z2x7KExLS~70#z~bA8N;ozVZI0!B5o+ghREpei4kc(^okwOQ$UnudZIRdRyC{y<)E~ zk)Ou2Z>XN)uI3B!Z!Gd`s+Q_SX{(35EqbCPZj$NvYPz*rI+?29_WDdY_zl0rMhJuk zN3=Oj-fNdKh3Ga4e99C0&`lCsMg#CV6+T4lUbgf@`zW}PYO23Si*j>bc`HeCB&@An zzAH-1&*%@t{%Q%ksJ`;=S@0!aUGTe75MDYK49@BbclF#NQH59dQ&5ycfmRYHSdA0( z?!k0hFa*>3U5(OX4Z9KG zmxc7l>tYtj&PR%oI@PPD6zh|LvrHprq`n9ro+s!o6T~m0Q!EB)pF6vnwM!v=o1e8T zWA{p46Mon#@qQ96ucPKbBxCJ}9xu4nIRD>M`ylQ!;Yfq~Qu4V=LNQazT3@F9CM;Pv zYfLL`<(;%xtG6*>tCytUK{8Fa5o{gSb$&ap7wih$kMFfs1)iF;(R$w^W;MW-Sb)-u ziFO96*=K$14M}`HC>}5Mf(G>ekSc%4Xpx`VSkq4H^B(qCP=2~0c#7BW9hSJw2Rp35 zUB#6*3(^^qGOAq?z?mZ3%iu~DQn-=ERnI;zN zY*X96y7V0YNY^cc|NERR{}`k7JGW(>bHoskJjaq;l)L&6UOzB@$u693}xc@5%jp((6hRDYq;45wv6JwvE% zxeIFBGP=293I;WNMh^zX_Qm91a-tfZrP^@y(sn(y)lcjOlJg{X*DSi=0Y$oi36Im% zcx)nU>yIk(-_t%tjVaCJT276%T(F1*8hS6x8}^|kK^2}<9ADi~L5r73iYVSj!cS^^ z+bX0_P>pj7T-6OiNMYYtY{B`JH}!T#>=HS6`Elg!rYW4z*wA7;^p>Z_{kkmvT$KDi zVY8Q>WC`fc`t(6%DyhNN%YIhu!jj#(l0_QDbzbiSA@7?@SSVN-5j`vFf_38Jy+=Gh zSS1qq55j_Tfj2R&hr?v)t&~5G=-==p{+$Od$}T@%;GHaGNXuGRdvJHnH2 z`phG%)y>qqk)J;!v0d%Hm&I17_p~8|8AFJ z>u~y9C&l<7v?f@2lk1#6l-oBIkE3OFsg|>mgw`U7g$L98o(i7@xT-*&bvVG`w@*Sv zQso4~*SQ(M1cGd!q{GwBL#VWX3-7PY=TsEG{ota4TR|;#wH7a6tX6BaCApDgj4_T?>MpekcXu^a z)wUakV910(W|El9u=$0=5C|~|3rWRd{`ndF6llL-VuV3;f?nUev8aUcW+=68Fs zWjoBgNjT^I!l zkIMWV*#J-X;Fss28v1N0?^AeQ)?{Q7`mSwpGnGqc?55Sqmdl#9BMom&mX=BCO)W}O zMoa5R$mCKf#B4{i97IEfcQcFEaPeK|@9O1Skx{I0TcLKnRPXWCOdEPCvF-S@td>b= z_BJP<>$Cq%MIbE3DlUqG8_x){KOBtGda-&C+PiQS=*v^&`scq$-W&UScJ83QL zTso~;`IB3nlW{`3|2yhFsl_t=T6#*??7}Ab)AT#1!28>StZCcER7&4w>DiPP*VB5Y zuqTi2#(R}Ip)c>tDr9vljb+YkKY1OF%Q~U$KVu~dJkHWJ$FvHpCuK}oeEq?BvU$z2 z%$>XoJ9Nu7%nVOeJ6c1n;U+ME^3I-{*&1%`Xu6!h*@=psBl6;X77Y9W;2K~8cpGVl zI1dLp0M6rk-+A|4aC(``hf_*vcoe=rI%>p2D3>jk_0=QZ0+eOM>3HX9e`>LiOeuW# zl{xJ+B8`8`1AiKhp62g=k1{`*Q%*rzxtt&GClDqTe<&p>vf}iY4^~7pj=C_ zT%qsfabije{e~!`!v4my5#=s;k2>e}wHsx2&nc(i+j3vI%$&=l5Tz{2-aDuK+`jU4 zE3#6en^5MN_bA8f=M0|L_Ylei(0dP7-CdB?c9-I%=+XGarG(iN^2_mMrG7SYf<(xa;@5Cx3ZdNomrC(+eT{qrBQ%yKG<0e) zUj8k|%l+L~%5N$a_}&TS%keWx`LieFm*Z!Z@|DHKm(`{CWGP;XQaN63|4=Fat#Sc8 zD}lN6z_ZHn(z5+O$N9HBM2nB3zvcQ$%YC%Ia)EMwIbK@ka`dF~6QvxfX{2$o`zl0t z^Ax^?_y~i*Ily}0Lcjoa0Url$0PX-D03HE;4E!4~3xG#aHJ1KVpru3tNIDiNwUcP*?e>xpRe|mUo2^U)8?SPs>)Qei~9k?OjM-*?}Lm>q1(nUP!Cd zj;{68+_5T13p*E6bC=40!n^Qw_{X$T?P`C7y6|hz`D&M{b_8isUD>{36)c5fD9>Z5 zKZt_Ml@n3qJ?4imbwBW7;6~sv;2GdQfcY426fl6RfxCd81OE>AK?kdVByjZ{;Im@j zi9D40!&%MiuRD6iOq+=ugmuD98S$BXDwv%y?d*)^=t^44WK27sa$thk)4B!yMzM2K zkkgK$wkUBZ{HeTS#K)tXRV8I?*OjcT=Mp9?7hbfze`sKISXGRSgKy&wOajHylerWZ z*NQofNZd?Mp|At#tib3{(uQN&hTT6frerPCF@q*-3C?L-lyNJrOlxT)l~;kvZ;dFg9340_n?--2!g}G>0BHoVa(`CIGM^hS{maD#`CtL^Y2qe5?S9T=>@tn zlZ!)xvy?X}61R-(tt4)x`psw3;iXEUHk%7^#nFtE5*r9wm@D)@3)Mr3n;9f$`0&)x zp;+WJHj^;ZNI8|p+J__K>Zx?VGNw%1OzX;&@dmYHh&w3)+sV9+0PEM4wy$8mc%kLn9ykd4{3KA1`}39-%VF1a&bq?#P#c0x0czS50UsV zUBM@EsAuecP@EdiJ7#>w`ZD##IyyspnbN7A%EUXF5>YbrW!+NL^1|ehzj}}2CWhWkciv5xV+ zkqDp7iLuD&`k@{*x;fUVMv8M$Y=-{K@mJ$>t`_kzt_1jA=39LV;G+2>02j@BfiD4H z10Dr9k9!e#8F+gR@LCK!k%#jB#p^44kARSaMSHuNo{kj^??Sl~_{%sFUu)$#d|)!b zaIV}0oCz!k&I0%N1nmzFw-5G*BS=;84n#(JVrpVtI5Zk=?Hm}6 z4r2?sX;zF5M8~5&{lT8m{@~`wU?di_;9py zMWk~y7QuGYY&FQmf)|oIv94o9XU~e(@W|K&sEAyqMfbk#OPRX9vwn~_ReTO=Vil~msu86J9asJkxDO zD+oYrZpDEtR*($`0)$sWaZDvXV`dUoS#k%5T9lnL5J+dzo0S^GvgH^lEvW|~vf{?H z5m%C_eB6xDp6Of$V%W^s8z^QwMmo2Zq|Nki5E8nQgt6?*KnmMfaw9WiOhL*!xhY+l zHet@MrB^*%X6sNCoEaUyos<&SQz^Sud6J|omI7VWQnra2@GohYzo8DuxiLD-OsiEH zz~G=NC_6OExPaInNUCSlL+9DH88?V2JB=GCrdWo({aSjp79u_ik*y>QZAy1^8_EW} z8W!3#r*zwJ@(1Z5Teos)C8h7sQ}$PB)@yUAo<>}RwavzJLgVd!G?80*H!l`ENaKiikq;?P?xC84jR z8ta!dySN1lW9E{WG4v9ej00^Vvt60dY$dN_=1c`sm(bIgZO3|(W|y`qTfnb(K-edg zsk}0#Pmk-DK(EoWaNLGgZEfu>Xkcbp1N#(LfM;bBy32B81!KVxEz_)E*gG_VT7+1_ z%$Qby3C(mUp8|P6(aC{w%u9tAq1xo&2f}+ z*{#EL*o)=2^BIr^pW}ow4f0e5dSk(Vq8j~oY;nTP#K$Sd^$~k9d7mJqZ)^h-o}uWa z`LrIP#rZ)t>pxD{Y*hwxP(c+g;uUzdw#V2Z5cib1D<6bLnKo81rc$<~4qZrC&Pibs z%8zX%6|!vok{ou^H;MgPtVw=CWWQwBp{j7JGLmytf%UL?PP4Q|v-Ae>RT0_`_MZWb z7RzayRziQCv3Erk>hAyvA@RN;y;X^%&~8TC!**q%EJNYDhh3G%gc;C3AEPm@H#)b8 zYcr1hNkO`G)ZBrURXew~!t{Sx@PnI(2gS_=rF9-wVA{+h=~nkzN}sA*%$1A?@`%d zSh;x~d8=N#i#+HvoBh44OFNvpe zV1<^>s?YMW7U5Od*$Hk3?uQ*)Fjs6i06)ZNOiC!r$D?D-dj*;=RNUo>*?Qd4uaF}! z*-T;wrc$Pv0Q=Il^bVq1-FvVb!2(d(5l;v!@T|asRBQc^#3x-1@h51W_5`t;g%~C; z?tDjJpJy>61M}`BclR`!{98{pKBHv}J1u)A!Kp2CYKH2KDj{J1fcfiPg6nh47xF$l z1Fl&xEQ2}g51yrVzNg}KFSsC7m1fA61mvG(;!sMN$!E=P5o@K%jGjf>O+rK7CwmsS zmKx?0F6wtJ(boxVGGR=^dVPjTZ%LkEA%s30k`>*0+U?)sl0qBePQBcyZV=ppK%e#; zk+q`1cbxgSs1lph@Z5``+lm?rhC^qx%k&}xpKo*1qayo)*R`622iRGLvqth?BZCCp zS#%I$I`x$6wD79EvM?xVMVr}@PMgt}Q6Mi@iXS84LF!gpx`J{A{merT6XeFlrjdEA zlD-In8l0%VP z@?rL3Xlv`hF`N10mGn(gVKLv$*oS=G+cW0QM}#Ytq=plhV6u(Qfa`z+u_EBiuv%k` zO->t@?I_DC&w~z_Hnxc3v&7zjg758|G0bylscpcQ(<89gytGUTgJ@cnT5)!va-Pq>21QVXBg21jCB6gcV(_Z!q zCSKr@2jS}WbtiPo=ybDJDyYpT4U25jOCXN>q<}dn(!A<`{ZsK8H5(2GvthMGYSNsn zBwSDapcL2#YJ*Av37=ao-U?|BWy*j|$$I>^Fh6aSUbJ&|T+cdCmQv74Rta`G54K|^ z{zY1d667lOBrz8Y829<;0w4PZqnCtBGzEJmt$$8n z*UHMSmZv;{^iKIIOaX=qkB2K@ZDs5#H~V!3eMj!*ZjOtp>970|cux{%dB&sBNMvHt zLkB(dBnxCMMxSx1@U`6Qp%*KeAUEmGGKPJtLtPv15*i|3trUOmQPFCz5J=oj43q{% zBXN*ul4^Qpg!J?8ws`78trl*a}vy%K# zpR|nFEgsqKueuhZ4u>F4@CFvpK3~n2iu47>Y`1h;AOt1qT<`7{MK_>q zSf3;Dw-V+2VM|MNkYZYgS;WDfn`HDvmC#UiUaQidvsRO53PbaXD_lZWaIf|XFS%$0 zlMMsaMNdoe$1B*mE@1~rN+tWe=j%mRkYm_(?zS2gdXM6+i(gs^??k7w(`DvdvX=IXw9Cg9SH`(7 zN!NZT&Nm#Y+Df}yl^8a<5&u7u@WX(#Us`TtK1^b`GF9+O<<4<+M^Y){LKoe`LeQk2 zm9k^H4fP>@TjkA>tOYeA9Nu~{#h~&{rS1>{$(h;^)VrX-1_k-M#HuNjHLV9l_9e!u zL~lO@rheje-|ed&<=z*6CvGS_Te(zAsiI5h;~RsEs2%#Hu}jqI8trtC%gy{}hynIWWgi@H+6)d+SSXvbzUYZ?U)z3xwpIu~aMw)5 zYQ^(J?*g__-vtBWf{G@6*G+C3CFvT${|24L?)R}|y|l?Ae5cyg82IHJciYg=(6trx zJd;UbCUcbZGp{1TC*O~y^ zK98npk7ER#TpGgRj=Dg+sex88|Htc^1)rC$XRdh_{&rVYjdoFOpliY_^)mmEAooTl zV33}$LLiYUMlb`LuZF%gEC$xpQc-X?Q!q-3n>z&86|`m_O-2imC;Vvi`q|R;dg>m zPh;8?^!1;4DN+&0p5b8!8ii}zJz#XvzaioGRl?WZry2JAM&WIDD4vgN@^W+flgYdQM|4FUW6 z?$ER`^-^7p9rboAY12L&IFLr4VT{69Pkp38x~^AGIa+^bXyt4` zy1h1Lq_dh)T?flFmFOipz%Bt%gA2z6LCjPL*9me&peiQ*W9?YM7vHp$-X%EW}RzewKPwZwz$|kqAN_| zMww2nraP*oGpYI=wS2) zKcg$$(Q~^*6>i}Veo+qiT1XscHMZZq2h(lB5KQWKH%gB;2#@&#mrhs2GI^+J&ysN4 z0#|!QC}HIeO2Q4S)?QODd@~@vSwml06wsfjkC-4k4=G0CEVr6a%uo2vOBq=s@dct^ zx%|!|LA(o{Vi8FD?D^HKO$z8+y{vf=yI*p{5qV0L_z($K)zflcC~fWtpDH-jxLL5L z_8{&-;RuEMQgYeL1Cf-LG4G;7DOj>_)}+j&nX{8(t=`InEpC$hhbfhU8^O|HUFWvr z;=(T9L-<~6QQ)ab8Z8ghF|&aVQyxk)Cfe?+W}oq}Hzo0Tzj&(D4I0q@U8?*AqdG6O zvZkHpEiU%BUw*p5{~@>6JtT3P4|W)Qr4v+nn;@MlDI?nDKH7CE+sELd781D0#^lRH z`kYJL-^iX{*v*d-Zt@DBYM?*LiZVqk($T86{lmf|t~N+mW0#R>OPjFqGs^OQ*q{d* zgceZjyMDT95j^V`Z2l)dT&Mn)MU1^t!(M5WPm$RtT(!CF0=+GW*NXFpeisn_+2`3W z%9nfR2QO+6cMDBH_0oBlD9hp8jH%}ewaxI=>Wk>Mib)vM@EINSi*57CyWn&+I779; z>V<83VvCp9UXpVpcGc8f{IDWj%!DUsax5|)wDcb+@@MIQqDGYEV_H^?G+$iDd=1@~ zm5K!3H=9_SSinchMvcfGlV+E_&Nl<9t;XeizZN{C&qfKUk&0Apw7u{DKqkQ}U& zqvtB{Hx@kiOYuA%SO)O32Y$ZFz4z@v7qAN8=dDLWIZ|qm`{B%# zAg};OB3a;{fyJ2c9Pl`w_;M&mO5^(#(vPg2ah$BZx~=VEEdv#-qgAuwGw_+UmM++| znpU!nZ~bkymWAP9w{bUU+j#Mo+SS_u^Iwtx`2k+&ScT!K9C4v#qx0ZcRxbML(rQ}}O1;rkEn%YJZQmfsyd>2~F@ zd$GsgW;~Yum){|rdlz`_WwVnnFP@nD{4?TJHkIVYYSK@>FvPV@GE~h@YFZ=)RzU(0MgGk~3 G*8CSYEx*+O literal 0 HcmV?d00001 diff --git a/tests/test_assets/gene_info/input/genes_biodomains_good_input.csv b/tests/test_assets/gene_info/input/genes_biodomains_good_input.csv new file mode 100644 index 00000000..86424daf --- /dev/null +++ b/tests/test_assets/gene_info/input/genes_biodomains_good_input.csv @@ -0,0 +1,6 @@ +biodomain,abbr,label,color,go_id,goterm_name,n_symbol,symbol,ensembl_gene_id +Synapse,Sy,Synapse [Sy],#329a33,GO:0005102,signaling receptor binding,362.0,TNMD,ENSG00000000005 +Proteostasis,Pr,Proteostasis [Pr],#c8b269,,endopeptidase activity,73.0,TNMD,ENSG00000000005 +Apoptosis,Ap,Apoptosis [Ap],,GO:0006915,apoptotic process,577.0,DPM1,ENSG00000000419 +Structural Stabilization,,,#ff9a9a,GO:0030863,cortical cytoskeleton,21.0,SCYL3,ENSG00000000457 +Synapse,Sy,Synapse [Sy],#329a33,GO:0034704,,,TNMD,ENSG00000000005 diff --git a/tests/test_assets/gene_info/input/igap_good_input.csv b/tests/test_assets/gene_info/input/igap_good_input.csv new file mode 100644 index 00000000..f73a59da --- /dev/null +++ b/tests/test_assets/gene_info/input/igap_good_input.csv @@ -0,0 +1,6 @@ +ensembl_gene_id,hgnc_symbol +ENSG00000000419,ADAMTS1 +ENSG00000000460,APH1B +ENSG00000000971, +ENSG00000001084,UNC5CL +ENSG00000001460,ICA1 diff --git a/tests/test_assets/gene_info/input/median_expression_good_input.csv b/tests/test_assets/gene_info/input/median_expression_good_input.csv new file mode 100644 index 00000000..06e1f0ae --- /dev/null +++ b/tests/test_assets/gene_info/input/median_expression_good_input.csv @@ -0,0 +1,11 @@ +ensembl_gene_id,min,first_quartile,median,mean,third_quartile,max,tissue +ENSG00000000419,4.13956136706118,4.81701951155434,4.99243993099552,4.97609994885496,5.12201928513349,5.47427830219618,TCX +ENSG00000000457,1.87170194617441,3.30725630078229,3.55768647846662,3.53029087812701,3.79362805388354,4.29012220609186,TCX +ENSG00000000971,2.37922531067355,3.77894297956489,4.77526159965158,4.66424852097104,5.42402633058216,7.4816750917471, +ENSG00000001036,2.54806826249572,3.41668592265363,3.69078894708659,3.67318399089501,3.89709834932296,4.68782752662201,TCX +ENSG00000001631,2.38916769066134,3.42971842714164,3.58242842659155,3.56276683938211,3.71417050416855,3.99659888901762,TCX +ENSG00000000419,2.46285760909518,3.81231183101654,4.11076676482308,4.10960211499538,4.40350009435052,5.84841488692408,DLPFC +ENSG00000000971,1.7463973196137,3.69304174986212,4.33355530502762,4.38230106499301,4.99383675215591,8.17049589317842,DLPFC +ENSG00000000419,2.43407916590901,3.81701148864582,4.16532587564932,4.13384839958097,4.50050882932851,5.35079455494603,IFG +ENSG00000001036,2.03791264465019,3.43682539452037,3.63555121276883,3.63300507492418,3.86171607052173,4.79978134261528,IFG +ENSG00000001631,3.0677246327387,3.75499700008387,3.88052592466859,3.87774467019765,4.01405541625413,4.42188371387498,IFG diff --git a/tests/test_assets/gene_info/input/proteomics_good_input.csv b/tests/test_assets/gene_info/input/proteomics_good_input.csv new file mode 100644 index 00000000..3d88d523 --- /dev/null +++ b/tests/test_assets/gene_info/input/proteomics_good_input.csv @@ -0,0 +1,17 @@ +uniqid,genename,uniprotid,ensembl_gene_id,tissue,log2_fc,ci_upr,ci_lwr,pval,cor_pval +DPM1|O60762,DPM1,O60762,ENSG00000000419,DLPFC,-0.000532480602341678,0.108873058799453,-0.109938020004136,0.999999279258724,0.99999999995416 +GCLC|P48506,GCLC,P48506,ENSG00000001084,DLPFC,0.172360117768928,0.241008334458834,0.103711901079023,3.16356041363264e-09,6.82372624103319e-08 +CFH|P08603,CFH,P08603,ENSG00000000971,DLPFC,,,,, +CYP51A1|Q16850,CYP51A1,Q16850,ENSG00000001630,DLPFC,,,,, +DPM1|O60762,DPM1,O60762,ENSG00000000419,MFG,0.175238552542385,0.813647160416635,-0.463170055331864,0.784427093629634,0.999999988150877 +GCLC|P48506,GCLC,P48506,ENSG00000001084,MFG,0.0113505662359543,0.230040224838345,-0.207339092366436,0.991302618555794,0.999999988150877 +CFH|P08603,CFH,P08603,ENSG00000000971,,0.00509733585480276,0.799985494934676,-0.789790823225071,0.999866632256444,0.999999988150877 +CYP51A1|Q16850,CYP51A1,Q16850,ENSG00000001630,MFG,,,,, +DPM1|O60762,DPM1,,ENSG00000000419,TCX,-0.149342443361856,0.0105903497498241,-0.309275236473537,0.072771233327826,0.382525226498101 +GCLC|P48506,GCLC,P48506,ENSG00000001084,TCX,-0.0429270360844782,0.0424170350995262,-0.128271107268483,0.461818930212383,0.999999901144234 +CFH|P08603,CFH,P08603,ENSG00000000971,TCX,,,,, +CYP51A1|Q16850,CYP51A1,Q16850,ENSG00000001630,TCX,,,,, +DPM1|O60762,DPM1,O60762,ENSG00000000419,AntPFC,0.0518224255427633,0.214895275697269,-0.111250424611742,0.73447214737169,0.999999904091243 +GCLC|P48506,,P48506,ENSG00000001084,AntPFC,-0.0232034474563143,0.0767886612547822,-0.123195556167411,0.848120820467051,0.999999904091243 +CFH|P08603,CFH,P08603,ENSG00000000971,AntPFC,,,,, +,CYP51A1,Q16850,ENSG00000001630,AntPFC,0.264726640853272,0.566731977706236,-0.0372786959996912,0.0989464176618443,0.564688784959532 diff --git a/tests/test_assets/gene_info/input/proteomics_srm_good_input.csv b/tests/test_assets/gene_info/input/proteomics_srm_good_input.csv new file mode 100644 index 00000000..2ef92fa6 --- /dev/null +++ b/tests/test_assets/gene_info/input/proteomics_srm_good_input.csv @@ -0,0 +1,7 @@ +uniqid,genename,uniprotid,ensembl_gene_id,tissue,log2_fc,ci_upr,ci_lwr,pval,cor_pval +CD2AP|Q9Y5K6,CD2AP,Q9Y5K6,ENSG00000001630,DLPFC,0.026321079163413537,0.10244645627426802,-0.049804297947440936,0.6959456373623885,0.9515991368016332 +,SNCA,P37840,ENSG00000001629,DLPFC,-0.02568871764608687,0.028374837069623488,-0.07975227236179723,0.5048364890502188,0.7196605269439289 +NDUFA7|O95182,,O95182,ENSG00000000419,DLPFC,-0.08256234002725381,-0.03426991654584031,-0.13085476350866732,0.00019022687174463648,0.0010779522732196067 +DIP2B|Q9P265,DIP2B,,ENSG00000001036,DLPFC,0.002289361997716892,0.07241425980972072,-0.06783553581428692,0.9967679822609551, +VSNL1|P62760,VSNL1,P62760,ENSG00000000971,,-0.013745470109882328,0.014686673450034729,-0.042177613669799385,0.4928746100735819,0.7178825842376084 +SYT11|Q9BT88,SYT11,Q9BT88,ENSG00000000005,DLPFC,0.029120588040412152,0.07403386661338425,-0.015792690532559942,0.2810529496558294,0.4955407270247518 diff --git a/tests/test_assets/gene_info/input/proteomics_tmt_good_input.csv b/tests/test_assets/gene_info/input/proteomics_tmt_good_input.csv new file mode 100644 index 00000000..a491fc2b --- /dev/null +++ b/tests/test_assets/gene_info/input/proteomics_tmt_good_input.csv @@ -0,0 +1,7 @@ +uniqid,genename,uniprotid,ensembl_gene_id,tissue,log2_fc,ci_upr,ci_lwr,pval,cor_pval +CYP51A1|A0A0C4DFL7,CYP51A1,A0A0C4DFL7,ENSG00000001630,DLPFC,-0.096536825520855,-0.0249064834553162,-0.168167167586394,0.00861397335314594,0.0596617463116164 +ANKIB1|Q9P2G1,ANKIB1,Q9P2G1,ENSG00000001629,,-0.0231875933838178,-0.00261466480061534,-0.0437605219670202,0.0274462753402693,0.124442505184433 +,DPM1,H0Y368,ENSG00000000419,DLPFC,-0.0318866351834386,-0.00176441716357351,-0.0620088532033036,0.038161929543103,0.153081771056205 +FUCA2|Q9BTY2,,Q9BTY2,ENSG00000001036,DLPFC,-0.0889169453741395,0.00602502277420743,-0.183858913522486,0.0661508543031291,0.213410933915364 +CFH|P08603,CFH,P08603,ENSG00000000971,DLPFC,,0.156029348948909,-0.00385979845880688,0.0619708516992043,0.204796476548682 +KRIT1|O00522,KRIT1,,ENSG00000001631,DLPFC,0.052873152684011,0.12115434837832,-0.0154080430102979,0.128072571527854,0.318778967948762 diff --git a/tests/test_assets/gene_info/input/target_list_good_input.csv b/tests/test_assets/gene_info/input/target_list_good_input.csv new file mode 100644 index 00000000..90faf6ea --- /dev/null +++ b/tests/test_assets/gene_info/input/target_list_good_input.csv @@ -0,0 +1,6 @@ +source,team,rank,ensembl_gene_id,hgnc_symbol,target_choice_justification,predicted_therapeutic_direction,data_used_to_support_target_selection,data_synapseid,study,input_data,validation_study_details,initial_nomination +Source_1,Team_1,17-100,ENSG00000000005,TNMD,"Based on a network of genomic and epigenomic elements in the region of this gene, in combination with phenotypes, we've determined it is most likely that the gene causally affects tau tangle counts in the ROSMAP cohort (see www.molecular.network).",Antagonism predicted to reduce disease progression.,"genotype, RNAseq, tau tangle counts",syn12345,Study_1,"Genetics, RNA, Protein, Clinical",,2018.0 +Source_1,Team_2,,ENSG00000000419,DPM1,Genes are upregulated in AD based on single cell analysis of microglia subtypes,Antagonism predicted to reduce disease progression.,scRNAseq and snRNAseq in microglia,,Study_1,,Not prioritized for experimental validation ,2023.0 +Source_2,Team_3,12,ENSG00000000419,DPM1,"By applying the algorithm and process developed in https://doi.org/10.1038/s41598-020-79740-x, we identified frequently co-expressed gene modules",,AD and normal brain transcriptomic datasets,,Study_2,RNA,not prioritized for experimental validation,2022.0 +Source_3,Team_4,1-10,ENSG00000000419,DPM1,,Agonism predicted to reduce disease progression.,TMT proteomics (partial dataset),syn56789,Study_3,"Protein, Clinical",not prioritized for experimental validation,2018.0 +Source_4,Team_5,,ENSG00000000457,SCYL3,Genes are downregulated in AD based on single cell analysis of microglia subtypes,Agonism predicted to reduce disease progression.,,,Study_3,RNA,Not prioritized for experimental validation , diff --git a/tests/test_assets/gene_info/input/tep_adi_info_good_input.csv b/tests/test_assets/gene_info/input/tep_adi_info_good_input.csv new file mode 100644 index 00000000..fdeaf6d5 --- /dev/null +++ b/tests/test_assets/gene_info/input/tep_adi_info_good_input.csv @@ -0,0 +1,6 @@ +ensembl_gene_id,hgnc_symbol,is_adi,is_tep +ENSG00000000005,TNMD,True,True +ENSG00000000419,DPM1,, +ENSG00000001497,LAS1L,True, +ENSG00000001084,GCLC,,True +ENSG00000183791,ABCD,, diff --git a/tests/test_assets/gene_info/output/gene_info_good_output_1.json b/tests/test_assets/gene_info/output/gene_info_good_output_1.json new file mode 100644 index 00000000..274dee47 --- /dev/null +++ b/tests/test_assets/gene_info/output/gene_info_good_output_1.json @@ -0,0 +1,825 @@ +[ + { + "ensembl_gene_id": "ENSG00000000005", + "name": "tenomodulin", + "summary": "This gene encodes a protein that is related to chondromodulin-I, which is a cartilage-specific glycoprotein that functions to stimulate chondrocyte growth and to inhibit tube formation of endothelial cells. This protein is also an angiogenesis inhibitor. Genetic variation in this gene is associated with a risk for type 2 diabetes, central obesity and serum levels of systemic immune mediators in a body size-dependent manner. This gene is also a candidate gene for age-related macular degeneration, though a direct link has yet to be demonstrated. [provided by RefSeq, Sep 2009].", + "symbol": "TNMD", + "alias": [ + "BRICD4", + "CHM1L", + "TEM" + ], + "is_igap": false, + "is_eqtl": false, + "is_any_rna_changed_in_ad_brain": false, + "rna_brain_change_studied": false, + "is_any_protein_changed_in_ad_brain": false, + "protein_brain_change_studied": true, + "target_nominations": [ + { + "source": "Source_1", + "team": "Team_1", + "rank": "17-100", + "hgnc_symbol": "TNMD", + "target_choice_justification": "Based on a network of genomic and epigenomic elements in the region of this gene, in combination with phenotypes, we've determined it is most likely that the gene causally affects tau tangle counts in the ROSMAP cohort (see www.molecular.network).", + "predicted_therapeutic_direction": "Antagonism predicted to reduce disease progression.", + "data_used_to_support_target_selection": "genotype, RNAseq, tau tangle counts", + "data_synapseid": "syn12345", + "study": "Study_1", + "input_data": "Genetics, RNA, Protein, Clinical", + "validation_study_details": null, + "initial_nomination": 2018.0 + } + ], + "median_expression": null, + "druggability": [ + { + "sm_druggability_bucket": 7, + "safety_bucket": 4, + "abability_bucket": 3, + "pharos_class": "Tbio", + "classification": null, + "safety_bucket_definition": "More than two of: high off target gene expression, cancer driver, essential gene, associated deleterious genetic disorder, HPO phenotype associated gene, or black box warning on clinically used drug.", + "abability_bucket_definition": "Cell membrane-bound proteins. Highly accessible to antibody-based therapies, but potentially less so than secreted proteins or ECM components." + } + ], + "total_nominations": 1.0, + "biodomains": [ + "Proteostasis", + "Synapse" + ], + "is_adi": true, + "is_tep": true, + "resource_url": "https://adknowledgeportal.synapse.org/Explore/Target%20Enabling%20Resources?QueryWrapper0=%7B%22sql%22%3A%22select%20*%20from%20syn26146692%20WHERE%20%60isPublic%60%20%3D%20true%22%2C%22limit%22%3A25%2C%22offset%22%3A0%2C%22selectedFacets%22%3A%5B%7B%22concreteType%22%3A%22org.sagebionetworks.repo.model.table.FacetColumnValuesRequest%22%2C%22columnName%22%3A%22target%22%2C%22facetValues%22%3A%5B%22TNMD%22%5D%7D%5D%7D", + "ensembl_info": { + "ensembl_release": "111", + "ensembl_possible_replacements": [], + "ensembl_permalink": "https://jan2024.archive.ensembl.org/Homo_sapiens/Gene/Summary?db=core;g=ENSG00000000005" + } + }, + { + "ensembl_gene_id": "ENSG00000000419", + "name": "dolichyl-phosphate mannosyltransferase subunit 1, catalytic", + "summary": "Dolichol-phosphate mannose (Dol-P-Man) serves as a donor of mannosyl residues on the lumenal side of the endoplasmic reticulum (ER). Lack of Dol-P-Man results in defective surface expression of GPI-anchored proteins. Dol-P-Man is synthesized from GDP-mannose and dolichol-phosphate on the cytosolic side of the ER by the enzyme dolichyl-phosphate mannosyltransferase. Human DPM1 lacks a carboxy-terminal transmembrane domain and signal sequence and is regulated by DPM2. Mutations in this gene are associated with congenital disorder of glycosylation type Ie. Alternative splicing results in multiple transcript variants. [provided by RefSeq, Nov 2015].", + "symbol": "DPM1", + "alias": [ + "MPDS", + "CDGIE" + ], + "is_igap": true, + "is_eqtl": true, + "is_any_rna_changed_in_ad_brain": true, + "rna_brain_change_studied": true, + "is_any_protein_changed_in_ad_brain": true, + "protein_brain_change_studied": true, + "target_nominations": [ + { + "source": "Source_1", + "team": "Team_2", + "rank": null, + "hgnc_symbol": "DPM1", + "target_choice_justification": "Genes are upregulated in AD based on single cell analysis of microglia subtypes", + "predicted_therapeutic_direction": "Antagonism predicted to reduce disease progression.", + "data_used_to_support_target_selection": "scRNAseq and snRNAseq in microglia", + "data_synapseid": null, + "study": "Study_1", + "input_data": null, + "validation_study_details": "Not prioritized for experimental validation ", + "initial_nomination": 2023.0 + }, + { + "source": "Source_2", + "team": "Team_3", + "rank": "12", + "hgnc_symbol": "DPM1", + "target_choice_justification": "By applying the algorithm and process developed in https://doi.org/10.1038/s41598-020-79740-x, we identified frequently co-expressed gene modules", + "predicted_therapeutic_direction": null, + "data_used_to_support_target_selection": "AD and normal brain transcriptomic datasets", + "data_synapseid": null, + "study": "Study_2", + "input_data": "RNA", + "validation_study_details": "not prioritized for experimental validation", + "initial_nomination": 2022.0 + }, + { + "source": "Source_3", + "team": "Team_4", + "rank": "1-10", + "hgnc_symbol": "DPM1", + "target_choice_justification": null, + "predicted_therapeutic_direction": "Agonism predicted to reduce disease progression.", + "data_used_to_support_target_selection": "TMT proteomics (partial dataset)", + "data_synapseid": "syn56789", + "study": "Study_3", + "input_data": "Protein, Clinical", + "validation_study_details": "not prioritized for experimental validation", + "initial_nomination": 2018.0 + } + ], + "median_expression": [ + { + "min": 4.13956136706118, + "first_quartile": 4.81701951155434, + "median": 4.99243993099552, + "mean": 4.97609994885496, + "third_quartile": 5.12201928513349, + "max": 5.47427830219618, + "tissue": "TCX" + }, + { + "min": 2.46285760909518, + "first_quartile": 3.81231183101654, + "median": 4.11076676482308, + "mean": 4.10960211499538, + "third_quartile": 4.40350009435052, + "max": 5.84841488692408, + "tissue": "DLPFC" + }, + { + "min": 2.43407916590901, + "first_quartile": 3.81701148864582, + "median": 4.16532587564932, + "mean": 4.13384839958097, + "third_quartile": 4.50050882932851, + "max": 5.35079455494603, + "tissue": "IFG" + } + ], + "druggability": null, + "total_nominations": 3.0, + "biodomains": [ + "Apoptosis" + ], + "is_adi": false, + "is_tep": false, + "resource_url": "https://adknowledgeportal.synapse.org/Explore/Target%20Enabling%20Resources?QueryWrapper0=%7B%22sql%22%3A%22select%20*%20from%20syn26146692%20WHERE%20%60isPublic%60%20%3D%20true%22%2C%22limit%22%3A25%2C%22offset%22%3A0%2C%22selectedFacets%22%3A%5B%7B%22concreteType%22%3A%22org.sagebionetworks.repo.model.table.FacetColumnValuesRequest%22%2C%22columnName%22%3A%22target%22%2C%22facetValues%22%3A%5B%22DPM1%22%5D%7D%5D%7D", + "ensembl_info": { + "ensembl_release": "111", + "ensembl_possible_replacements": [], + "ensembl_permalink": "https://jan2024.archive.ensembl.org/Homo_sapiens/Gene/Summary?db=core;g=ENSG00000000419" + } + }, + { + "ensembl_gene_id": "ENSG00000000457", + "name": "SCY1 like pseudokinase 3", + "summary": "This gene encodes a protein with a kinase domain and four HEAT repeats. The encoded protein interacts with the C-terminal domain of ezrin, an ERM protein, and may play a role in cell adhesion and migration. Alternative splicing results in multiple transcript variants encoding multiple isoforms. [provided by RefSeq, Jun 2012].", + "symbol": "SCYL3", + "alias": [ + "PACE-1", + "PACE1" + ], + "is_igap": false, + "is_eqtl": false, + "is_any_rna_changed_in_ad_brain": false, + "rna_brain_change_studied": false, + "is_any_protein_changed_in_ad_brain": false, + "protein_brain_change_studied": false, + "target_nominations": [ + { + "source": "Source_4", + "team": "Team_5", + "rank": null, + "hgnc_symbol": "SCYL3", + "target_choice_justification": "Genes are downregulated in AD based on single cell analysis of microglia subtypes", + "predicted_therapeutic_direction": "Agonism predicted to reduce disease progression.", + "data_used_to_support_target_selection": null, + "data_synapseid": null, + "study": "Study_3", + "input_data": "RNA", + "validation_study_details": "Not prioritized for experimental validation ", + "initial_nomination": null + } + ], + "median_expression": [ + { + "min": 1.87170194617441, + "first_quartile": 3.30725630078229, + "median": 3.55768647846662, + "mean": 3.53029087812701, + "third_quartile": 3.79362805388354, + "max": 4.29012220609186, + "tissue": "TCX" + } + ], + "druggability": null, + "total_nominations": 1.0, + "biodomains": [ + "Structural Stabilization" + ], + "is_adi": false, + "is_tep": false, + "resource_url": null, + "ensembl_info": { + "ensembl_release": null, + "ensembl_possible_replacements": [], + "ensembl_permalink": "https://jan2024.archive.ensembl.org/Homo_sapiens/Gene/Summary?db=core;g=ENSG00000000457" + } + }, + { + "ensembl_gene_id": "ENSG00000000460", + "name": "FIGNL1 interacting regulator of recombination and mitosis", + "summary": null, + "symbol": "FIRRM", + "alias": [ + "FLIP", + "MEICA1", + "C1orf112", + "Apolo1" + ], + "is_igap": true, + "is_eqtl": false, + "is_any_rna_changed_in_ad_brain": false, + "rna_brain_change_studied": false, + "is_any_protein_changed_in_ad_brain": false, + "protein_brain_change_studied": false, + "target_nominations": null, + "median_expression": null, + "druggability": [ + { + "sm_druggability_bucket": 13, + "safety_bucket": 3, + "abability_bucket": 3, + "pharos_class": "Tdark", + "classification": "Unknown: There is no information on ligands or structure in any of the categories above. ", + "safety_bucket_definition": "Two or fewer of: high off-target gene expression, cancer driver, essential gene, associated deleterious genetic disorder, HPO phenotype associated gene, or black box warning on clinically used drug.", + "abability_bucket_definition": null + } + ], + "total_nominations": null, + "biodomains": null, + "is_adi": false, + "is_tep": false, + "resource_url": null, + "ensembl_info": { + "ensembl_release": "111", + "ensembl_possible_replacements": [], + "ensembl_permalink": "https://jan2024.archive.ensembl.org/Homo_sapiens/Gene/Summary?db=core;g=ENSG00000000460" + } + }, + { + "ensembl_gene_id": "ENSG00000000938", + "name": "FGR proto-oncogene, Src family tyrosine kinase", + "summary": "This gene is a member of the Src family of protein tyrosine kinases (PTKs). The encoded protein contains N-terminal sites for myristylation and palmitylation, a PTK domain, and SH2 and SH3 domains which are involved in mediating protein-protein interactions with phosphotyrosine-containing and proline-rich motifs, respectively. The protein localizes to plasma membrane ruffles, and functions as a negative regulator of cell migration and adhesion triggered by the beta-2 integrin signal transduction pathway. Infection with Epstein-Barr virus results in the overexpression of this gene. Multiple alternatively spliced variants, encoding the same protein, have been identified. [provided by RefSeq, Jul 2008].", + "symbol": "FGR", + "alias": [ + "c-fgr", + "SRC2", + "p55c-fgr", + "p58-Fgr", + "p55-Fgr", + "p58c-fgr", + "c-src2" + ], + "is_igap": false, + "is_eqtl": false, + "is_any_rna_changed_in_ad_brain": false, + "rna_brain_change_studied": false, + "is_any_protein_changed_in_ad_brain": false, + "protein_brain_change_studied": false, + "target_nominations": null, + "median_expression": null, + "druggability": null, + "total_nominations": null, + "biodomains": null, + "is_adi": false, + "is_tep": false, + "resource_url": null, + "ensembl_info": { + "ensembl_release": "111", + "ensembl_possible_replacements": [], + "ensembl_permalink": "https://jan2024.archive.ensembl.org/Homo_sapiens/Gene/Summary?db=core;g=ENSG00000000938" + } + }, + { + "ensembl_gene_id": "ENSG00000000971", + "name": null, + "summary": "This gene is a member of the Regulator of Complement Activation (RCA) gene cluster and encodes a protein with twenty short consensus repeat (SCR) domains. This protein is secreted into the bloodstream and has an essential role in the regulation of complement activation, restricting this innate defense mechanism to microbial infections. Mutations in this gene have been associated with hemolytic-uremic syndrome (HUS) and chronic hypocomplementemic nephropathy. Alternate transcriptional splice variants, encoding different isoforms, have been characterized. [provided by RefSeq, Oct 2011].", + "symbol": "CFH", + "alias": [], + "is_igap": true, + "is_eqtl": true, + "is_any_rna_changed_in_ad_brain": false, + "rna_brain_change_studied": false, + "is_any_protein_changed_in_ad_brain": false, + "protein_brain_change_studied": true, + "target_nominations": null, + "median_expression": [ + { + "min": 2.37922531067355, + "first_quartile": 3.77894297956489, + "median": 4.77526159965158, + "mean": 4.66424852097104, + "third_quartile": 5.42402633058216, + "max": 7.4816750917471, + "tissue": null + }, + { + "min": 1.7463973196137, + "first_quartile": 3.69304174986212, + "median": 4.33355530502762, + "mean": 4.38230106499301, + "third_quartile": 4.99383675215591, + "max": 8.17049589317842, + "tissue": "DLPFC" + } + ], + "druggability": [ + { + "sm_druggability_bucket": 3, + "safety_bucket": 4, + "abability_bucket": 1, + "pharos_class": "Tbio", + "classification": "Targetable by structure: Structurally druggable protein, based on the presence of a druggable pocket in the protein (DrugEBIlity/CanSAR).", + "safety_bucket_definition": "More than two of: high off target gene expression, cancer driver, essential gene, associated deleterious genetic disorder, HPO phenotype associated gene, or black box warning on clinically used drug.", + "abability_bucket_definition": "Secreted protein. Highly accessible to antibody-based therapies." + } + ], + "total_nominations": null, + "biodomains": null, + "is_adi": false, + "is_tep": false, + "resource_url": null, + "ensembl_info": { + "ensembl_release": "111", + "ensembl_possible_replacements": [], + "ensembl_permalink": "https://jan2024.archive.ensembl.org/Homo_sapiens/Gene/Summary?db=core;g=ENSG00000000971" + } + }, + { + "ensembl_gene_id": "ENSG00000001036", + "name": "alpha-L-fucosidase 2", + "summary": "This gene encodes a plasma alpha-L-fucosidase, which represents 10-20% of the total cellular fucosidase activity. The protein is a member of the glycosyl hydrolase 29 family, and catalyzes the hydrolysis of the alpha-1,6-linked fucose joined to the reducing-end N-acetylglucosamine of the carbohydrate moieties of glycoproteins. This enzyme is essential for Helicobacter pylori adhesion to human gastric cancer cells. [provided by RefSeq, Aug 2010].", + "symbol": "FUCA2", + "alias": [ + "dJ20N2.5" + ], + "is_igap": false, + "is_eqtl": false, + "is_any_rna_changed_in_ad_brain": false, + "rna_brain_change_studied": false, + "is_any_protein_changed_in_ad_brain": false, + "protein_brain_change_studied": true, + "target_nominations": null, + "median_expression": [ + { + "min": 2.54806826249572, + "first_quartile": 3.41668592265363, + "median": 3.69078894708659, + "mean": 3.67318399089501, + "third_quartile": 3.89709834932296, + "max": 4.68782752662201, + "tissue": "TCX" + }, + { + "min": 2.03791264465019, + "first_quartile": 3.43682539452037, + "median": 3.63555121276883, + "mean": 3.63300507492418, + "third_quartile": 3.86171607052173, + "max": 4.79978134261528, + "tissue": "IFG" + } + ], + "druggability": [ + { + "sm_druggability_bucket": 1, + "safety_bucket": 3, + "abability_bucket": 1, + "pharos_class": "Tchem", + "classification": "Small molecule druggable: Protein with a small molecule ligand identified from ChEMBL, meeting TCRD activity criteria.", + "safety_bucket_definition": null, + "abability_bucket_definition": "Secreted protein. Highly accessible to antibody-based therapies." + } + ], + "total_nominations": null, + "biodomains": null, + "is_adi": false, + "is_tep": false, + "resource_url": null, + "ensembl_info": { + "ensembl_release": "111", + "ensembl_possible_replacements": [], + "ensembl_permalink": "https://jan2024.archive.ensembl.org/Homo_sapiens/Gene/Summary?db=core;g=ENSG00000001036" + } + }, + { + "ensembl_gene_id": "ENSG00000001084", + "name": "glutamate-cysteine ligase catalytic subunit", + "summary": "Glutamate-cysteine ligase, also known as gamma-glutamylcysteine synthetase is the first rate-limiting enzyme of glutathione synthesis. The enzyme consists of two subunits, a heavy catalytic subunit and a light regulatory subunit. This locus encodes the catalytic subunit, while the regulatory subunit is derived from a different gene located on chromosome 1p22-p21. Mutations at this locus have been associated with hemolytic anemia due to deficiency of gamma-glutamylcysteine synthetase and susceptibility to myocardial infarction.[provided by RefSeq, Oct 2010].", + "symbol": "GCLC", + "alias": [ + "GLCL", + "GCL", + "GLCLC", + "GCS" + ], + "is_igap": true, + "is_eqtl": false, + "is_any_rna_changed_in_ad_brain": false, + "rna_brain_change_studied": false, + "is_any_protein_changed_in_ad_brain": true, + "protein_brain_change_studied": true, + "target_nominations": null, + "median_expression": null, + "druggability": [ + { + "sm_druggability_bucket": 1, + "safety_bucket": 5, + "abability_bucket": 3, + "pharos_class": null, + "classification": "Small molecule druggable: Protein with a small molecule ligand identified from ChEMBL, meeting TCRD activity criteria.", + "safety_bucket_definition": "Clinical data with evidence of intolerable safety profile/adverse drug reactions in the desired modality and with target engagement. Drug for target withdrawn on those grounds.", + "abability_bucket_definition": "Cell membrane-bound proteins. Highly accessible to antibody-based therapies, but potentially less so than secreted proteins or ECM components." + } + ], + "total_nominations": null, + "biodomains": null, + "is_adi": false, + "is_tep": true, + "resource_url": "https://adknowledgeportal.synapse.org/Explore/Target%20Enabling%20Resources?QueryWrapper0=%7B%22sql%22%3A%22select%20*%20from%20syn26146692%20WHERE%20%60isPublic%60%20%3D%20true%22%2C%22limit%22%3A25%2C%22offset%22%3A0%2C%22selectedFacets%22%3A%5B%7B%22concreteType%22%3A%22org.sagebionetworks.repo.model.table.FacetColumnValuesRequest%22%2C%22columnName%22%3A%22target%22%2C%22facetValues%22%3A%5B%22GCLC%22%5D%7D%5D%7D", + "ensembl_info": { + "ensembl_release": "111", + "ensembl_possible_replacements": [], + "ensembl_permalink": null + } + }, + { + "ensembl_gene_id": "ENSG00000001167", + "name": "nuclear transcription factor Y subunit alpha", + "summary": "The protein encoded by this gene is one subunit of a trimeric complex, forming a highly conserved transcription factor that binds to CCAAT motifs in the promoter regions in a variety of genes. Subunit A associates with a tight dimer composed of the B and C subunits, resulting in a trimer that binds to DNA with high specificity and affinity. The sequence specific interactions of the complex are made by the A subunit, suggesting a role as the regulatory subunit. In addition, there is evidence of post-transcriptional regulation in this gene product, either by protein degradation or control of translation. Further regulation is represented by alternative splicing in the glutamine-rich activation domain, with clear tissue-specific preferences for the two isoforms. [provided by RefSeq, Jul 2008].", + "symbol": "NFYA", + "alias": [ + "CBF-B", + "NF-YA", + "HAP2", + "CBF-A" + ], + "is_igap": false, + "is_eqtl": false, + "is_any_rna_changed_in_ad_brain": false, + "rna_brain_change_studied": false, + "is_any_protein_changed_in_ad_brain": false, + "protein_brain_change_studied": false, + "target_nominations": null, + "median_expression": null, + "druggability": null, + "total_nominations": null, + "biodomains": null, + "is_adi": false, + "is_tep": false, + "resource_url": null, + "ensembl_info": { + "ensembl_release": "111", + "ensembl_possible_replacements": [], + "ensembl_permalink": "https://jan2024.archive.ensembl.org/Homo_sapiens/Gene/Summary?db=core;g=ENSG00000001167" + } + }, + { + "ensembl_gene_id": "ENSG00000001460", + "name": "sperm tail PG-rich repeat containing 1", + "summary": "Involved in positive regulation of apoptotic process and positive regulation of mitochondrial membrane permeability involved in apoptotic process. Predicted to be located in mitochondrion and nucleus. [provided by Alliance of Genome Resources, Apr 2022]", + "symbol": "STPG1", + "alias": [ + "C1orf201", + "MAPO2" + ], + "is_igap": true, + "is_eqtl": true, + "is_any_rna_changed_in_ad_brain": false, + "rna_brain_change_studied": true, + "is_any_protein_changed_in_ad_brain": false, + "protein_brain_change_studied": false, + "target_nominations": null, + "median_expression": null, + "druggability": null, + "total_nominations": null, + "biodomains": null, + "is_adi": false, + "is_tep": false, + "resource_url": null, + "ensembl_info": { + "ensembl_release": "111", + "ensembl_possible_replacements": [], + "ensembl_permalink": "https://jan2024.archive.ensembl.org/Homo_sapiens/Gene/Summary?db=core;g=ENSG00000001460" + } + }, + { + "ensembl_gene_id": "ENSG00000001461", + "name": "NIPA like domain containing 3", + "summary": "Predicted to enable magnesium ion transmembrane transporter activity. Predicted to be involved in magnesium ion transport. Predicted to be integral component of membrane. Predicted to be active in membrane. [provided by Alliance of Genome Resources, Apr 2022]", + "symbol": null, + "alias": [ + "DJ462O23.2", + "SLC57A5", + "NPAL3" + ], + "is_igap": false, + "is_eqtl": false, + "is_any_rna_changed_in_ad_brain": false, + "rna_brain_change_studied": false, + "is_any_protein_changed_in_ad_brain": false, + "protein_brain_change_studied": false, + "target_nominations": null, + "median_expression": null, + "druggability": null, + "total_nominations": null, + "biodomains": null, + "is_adi": false, + "is_tep": false, + "resource_url": null, + "ensembl_info": { + "ensembl_release": "111", + "ensembl_possible_replacements": [], + "ensembl_permalink": "https://jan2024.archive.ensembl.org/Homo_sapiens/Gene/Summary?db=core;g=ENSG00000001461" + } + }, + { + "ensembl_gene_id": "ENSG00000001497", + "name": "LAS1 like ribosome biogenesis factor", + "summary": "Enables RNA binding activity. Predicted to be involved in maturation of 5.8S rRNA and maturation of LSU-rRNA. Located in membrane. Part of MLL1 complex. Implicated in Wilson-Turner syndrome. [provided by Alliance of Genome Resources, Apr 2022]", + "symbol": "LAS1L", + "alias": [ + "Las1-like", + "Las1", + "dJ475B7.2", + "MRXSWTS", + "WTS" + ], + "is_igap": false, + "is_eqtl": false, + "is_any_rna_changed_in_ad_brain": false, + "rna_brain_change_studied": false, + "is_any_protein_changed_in_ad_brain": false, + "protein_brain_change_studied": false, + "target_nominations": null, + "median_expression": null, + "druggability": null, + "total_nominations": null, + "biodomains": null, + "is_adi": true, + "is_tep": false, + "resource_url": "https://adknowledgeportal.synapse.org/Explore/Target%20Enabling%20Resources?QueryWrapper0=%7B%22sql%22%3A%22select%20*%20from%20syn26146692%20WHERE%20%60isPublic%60%20%3D%20true%22%2C%22limit%22%3A25%2C%22offset%22%3A0%2C%22selectedFacets%22%3A%5B%7B%22concreteType%22%3A%22org.sagebionetworks.repo.model.table.FacetColumnValuesRequest%22%2C%22columnName%22%3A%22target%22%2C%22facetValues%22%3A%5B%22LAS1L%22%5D%7D%5D%7D", + "ensembl_info": { + "ensembl_release": "111", + "ensembl_possible_replacements": [], + "ensembl_permalink": "https://jan2024.archive.ensembl.org/Homo_sapiens/Gene/Summary?db=core;g=ENSG00000001497" + } + }, + { + "ensembl_gene_id": "ENSG00000001561", + "name": "ectonucleotide pyrophosphatase/phosphodiesterase 4", + "summary": "Enables bis(5'-adenosyl)-triphosphatase activity. Involved in positive regulation of blood coagulation and purine ribonucleoside catabolic process. Located in extracellular exosome and membrane. [provided by Alliance of Genome Resources, Apr 2022]", + "symbol": "ENPP4", + "alias": [ + "NPP4" + ], + "is_igap": false, + "is_eqtl": false, + "is_any_rna_changed_in_ad_brain": false, + "rna_brain_change_studied": false, + "is_any_protein_changed_in_ad_brain": false, + "protein_brain_change_studied": false, + "target_nominations": null, + "median_expression": null, + "druggability": null, + "total_nominations": null, + "biodomains": null, + "is_adi": false, + "is_tep": false, + "resource_url": null, + "ensembl_info": { + "ensembl_release": "111", + "ensembl_possible_replacements": [], + "ensembl_permalink": "https://jan2024.archive.ensembl.org/Homo_sapiens/Gene/Summary?db=core;g=ENSG00000001561" + } + }, + { + "ensembl_gene_id": "ENSG00000001617", + "name": "semaphorin 3F", + "summary": "This gene encodes a member of the semaphorin III family of secreted signaling proteins that are involved in axon guidance during neuronal development. The encoded protein contains an N-terminal Sema domain, an immunoglobulin loop and a C-terminal basic domain. This gene is expressed by the endothelial cells where it was found to act in an autocrine fashion to induce apoptosis, inhibit cell proliferation and survival, and function as an anti-tumorigenic agent. Alternative splicing results in multiple transcript variants encoding different isoforms. [provided by RefSeq, Jan 2016].", + "symbol": "SEMA3F", + "alias": [ + "SEMA4", + "SEMA-IV", + "SEMAK" + ], + "is_igap": false, + "is_eqtl": false, + "is_any_rna_changed_in_ad_brain": false, + "rna_brain_change_studied": false, + "is_any_protein_changed_in_ad_brain": false, + "protein_brain_change_studied": false, + "target_nominations": null, + "median_expression": null, + "druggability": null, + "total_nominations": null, + "biodomains": null, + "is_adi": false, + "is_tep": false, + "resource_url": null, + "ensembl_info": { + "ensembl_release": "111", + "ensembl_possible_replacements": [], + "ensembl_permalink": "https://jan2024.archive.ensembl.org/Homo_sapiens/Gene/Summary?db=core;g=ENSG00000001617" + } + }, + { + "ensembl_gene_id": "ENSG00000001626", + "name": "CF transmembrane conductance regulator", + "summary": "This gene encodes a member of the ATP-binding cassette (ABC) transporter superfamily. The encoded protein functions as a chloride channel, making it unique among members of this protein family, and controls ion and water secretion and absorption in epithelial tissues. Channel activation is mediated by cycles of regulatory domain phosphorylation, ATP-binding by the nucleotide-binding domains, and ATP hydrolysis. Mutations in this gene cause cystic fibrosis, the most common lethal genetic disorder in populations of Northern European descent. The most frequently occurring mutation in cystic fibrosis, DeltaF508, results in impaired folding and trafficking of the encoded protein. Multiple pseudogenes have been identified in the human genome. [provided by RefSeq, Aug 2017].", + "symbol": "CFTR", + "alias": [ + "ABCC7", + "MRP7", + "CFTR/MRP", + "dJ760C5.1", + "TNR-CFTR", + "ABC35", + "CF" + ], + "is_igap": false, + "is_eqtl": true, + "is_any_rna_changed_in_ad_brain": true, + "rna_brain_change_studied": true, + "is_any_protein_changed_in_ad_brain": false, + "protein_brain_change_studied": false, + "target_nominations": null, + "median_expression": null, + "druggability": null, + "total_nominations": null, + "biodomains": null, + "is_adi": false, + "is_tep": false, + "resource_url": null, + "ensembl_info": { + "ensembl_release": "111", + "ensembl_possible_replacements": [], + "ensembl_permalink": "https://jan2024.archive.ensembl.org/Homo_sapiens/Gene/Summary?db=core;g=ENSG00000001626" + } + }, + { + "ensembl_gene_id": "ENSG00000001629", + "name": "ankyrin repeat and IBR domain containing 1", + "summary": "Predicted to enable ubiquitin conjugating enzyme binding activity and ubiquitin protein ligase activity. Predicted to be involved in positive regulation of proteasomal ubiquitin-dependent protein catabolic process; protein polyubiquitination; and ubiquitin-dependent protein catabolic process. Predicted to be part of ubiquitin ligase complex. Predicted to be active in cytoplasm. [provided by Alliance of Genome Resources, Apr 2022]", + "symbol": "ANKIB1", + "alias": [], + "is_igap": false, + "is_eqtl": false, + "is_any_rna_changed_in_ad_brain": true, + "rna_brain_change_studied": true, + "is_any_protein_changed_in_ad_brain": false, + "protein_brain_change_studied": true, + "target_nominations": null, + "median_expression": null, + "druggability": null, + "total_nominations": null, + "biodomains": null, + "is_adi": false, + "is_tep": false, + "resource_url": null, + "ensembl_info": { + "ensembl_release": "111", + "ensembl_possible_replacements": [], + "ensembl_permalink": "https://jan2024.archive.ensembl.org/Homo_sapiens/Gene/Summary?db=core;g=ENSG00000001629" + } + }, + { + "ensembl_gene_id": "ENSG00000001630", + "name": "cytochrome P450 family 51 subfamily A member 1", + "summary": "This gene encodes a member of the cytochrome P450 superfamily of enzymes. The cytochrome P450 proteins are monooxygenases which catalyze many reactions involved in drug metabolism and synthesis of cholesterol, steroids and other lipids. This endoplasmic reticulum protein participates in the synthesis of cholesterol by catalyzing the removal of the 14alpha-methyl group from lanosterol. Homologous genes are found in all three eukaryotic phyla, fungi, plants, and animals, suggesting that this is one of the oldest cytochrome P450 genes. Two transcript variants encoding different isoforms have been found for this gene. [provided by RefSeq, Mar 2009].", + "symbol": "CYP51A1", + "alias": [ + "P450L1", + "P450-14DM", + "LDM", + "CP51", + "CYPL1", + "CYP51" + ], + "is_igap": false, + "is_eqtl": false, + "is_any_rna_changed_in_ad_brain": false, + "rna_brain_change_studied": false, + "is_any_protein_changed_in_ad_brain": false, + "protein_brain_change_studied": true, + "target_nominations": null, + "median_expression": null, + "druggability": null, + "total_nominations": null, + "biodomains": null, + "is_adi": false, + "is_tep": false, + "resource_url": null, + "ensembl_info": { + "ensembl_release": "111", + "ensembl_possible_replacements": [], + "ensembl_permalink": "https://jan2024.archive.ensembl.org/Homo_sapiens/Gene/Summary?db=core;g=ENSG00000001630" + } + }, + { + "ensembl_gene_id": "ENSG00000001631", + "name": "KRIT1 ankyrin repeat containing", + "summary": "This gene encodes a protein containing four ankyrin repeats, a band 4.1/ezrin/radixin/moesin (FERM) domain, and multiple NPXY sequences. The encoded protein is localized in the nucleus and cytoplasm. It binds to integrin cytoplasmic domain-associated protein-1 alpha (ICAP1alpha), and plays a critical role in beta1-integrin-mediated cell proliferation. It associates with junction proteins and RAS-related protein 1A (Rap1A), which requires the encoded protein for maintaining the integrity of endothelial junctions. It is also a microtubule-associated protein and may play a role in microtubule targeting. Mutations in this gene result in cerebral cavernous malformations. Multiple alternatively spliced transcript variants have been found for this gene. [provided by RefSeq, Sep 2009].", + "symbol": "KRIT1", + "alias": [ + "CAM", + "CCM1" + ], + "is_igap": false, + "is_eqtl": false, + "is_any_rna_changed_in_ad_brain": true, + "rna_brain_change_studied": true, + "is_any_protein_changed_in_ad_brain": false, + "protein_brain_change_studied": true, + "target_nominations": null, + "median_expression": [ + { + "min": 2.38916769066134, + "first_quartile": 3.42971842714164, + "median": 3.58242842659155, + "mean": 3.56276683938211, + "third_quartile": 3.71417050416855, + "max": 3.99659888901762, + "tissue": "TCX" + }, + { + "min": 3.0677246327387, + "first_quartile": 3.75499700008387, + "median": 3.88052592466859, + "mean": 3.87774467019765, + "third_quartile": 4.01405541625413, + "max": 4.42188371387498, + "tissue": "IFG" + } + ], + "druggability": null, + "total_nominations": null, + "biodomains": null, + "is_adi": false, + "is_tep": false, + "resource_url": null, + "ensembl_info": { + "ensembl_release": "111", + "ensembl_possible_replacements": [], + "ensembl_permalink": "https://jan2024.archive.ensembl.org/Homo_sapiens/Gene/Summary?db=core;g=ENSG00000001631" + } + }, + { + "ensembl_gene_id": "ENSG00000161149", + "name": null, + "summary": null, + "symbol": null, + "alias": [], + "is_igap": false, + "is_eqtl": false, + "is_any_rna_changed_in_ad_brain": false, + "rna_brain_change_studied": false, + "is_any_protein_changed_in_ad_brain": false, + "protein_brain_change_studied": false, + "target_nominations": null, + "median_expression": null, + "druggability": null, + "total_nominations": null, + "biodomains": null, + "is_adi": false, + "is_tep": false, + "resource_url": null, + "ensembl_info": { + "ensembl_release": "107", + "ensembl_possible_replacements": [ + "ENSG00000284130" + ], + "ensembl_permalink": "https://jul2022.archive.ensembl.org/Homo_sapiens/Gene/Summary?db=core;g=ENSG00000161149" + } + }, + { + "ensembl_gene_id": "ENSG00000183791", + "name": null, + "summary": null, + "symbol": null, + "alias": [], + "is_igap": false, + "is_eqtl": false, + "is_any_rna_changed_in_ad_brain": false, + "rna_brain_change_studied": false, + "is_any_protein_changed_in_ad_brain": false, + "protein_brain_change_studied": false, + "target_nominations": null, + "median_expression": null, + "druggability": null, + "total_nominations": null, + "biodomains": null, + "is_adi": false, + "is_tep": false, + "resource_url": "https://adknowledgeportal.synapse.org/Explore/Target%20Enabling%20Resources?QueryWrapper0=%7B%22sql%22%3A%22select%20*%20from%20syn26146692%20WHERE%20%60isPublic%60%20%3D%20true%22%2C%22limit%22%3A25%2C%22offset%22%3A0%2C%22selectedFacets%22%3A%5B%7B%22concreteType%22%3A%22org.sagebionetworks.repo.model.table.FacetColumnValuesRequest%22%2C%22columnName%22%3A%22target%22%2C%22facetValues%22%3A%5B%22ABCD%22%5D%7D%5D%7D", + "ensembl_info": { + "ensembl_release": "100", + "ensembl_possible_replacements": [ + "ENSG00000288631", + "ENSG00000288616", + "ENSG00000288607" + ], + "ensembl_permalink": "https://apr2020.archive.ensembl.org/Homo_sapiens/Gene/Summary?db=core;g=ENSG00000183791" + } + } +] \ No newline at end of file diff --git a/tests/test_assets/gene_info/output/gene_info_good_output_2.json b/tests/test_assets/gene_info/output/gene_info_good_output_2.json new file mode 100644 index 00000000..91bb925b --- /dev/null +++ b/tests/test_assets/gene_info/output/gene_info_good_output_2.json @@ -0,0 +1,825 @@ +[ + { + "ensembl_gene_id": "ENSG00000000005", + "name": "tenomodulin", + "summary": "This gene encodes a protein that is related to chondromodulin-I, which is a cartilage-specific glycoprotein that functions to stimulate chondrocyte growth and to inhibit tube formation of endothelial cells. This protein is also an angiogenesis inhibitor. Genetic variation in this gene is associated with a risk for type 2 diabetes, central obesity and serum levels of systemic immune mediators in a body size-dependent manner. This gene is also a candidate gene for age-related macular degeneration, though a direct link has yet to be demonstrated. [provided by RefSeq, Sep 2009].", + "symbol": "TNMD", + "alias": [ + "BRICD4", + "CHM1L", + "TEM" + ], + "is_igap": false, + "is_eqtl": false, + "is_any_rna_changed_in_ad_brain": false, + "rna_brain_change_studied": false, + "is_any_protein_changed_in_ad_brain": true, + "protein_brain_change_studied": true, + "target_nominations": [ + { + "source": "Source_1", + "team": "Team_1", + "rank": "17-100", + "hgnc_symbol": "TNMD", + "target_choice_justification": "Based on a network of genomic and epigenomic elements in the region of this gene, in combination with phenotypes, we've determined it is most likely that the gene causally affects tau tangle counts in the ROSMAP cohort (see www.molecular.network).", + "predicted_therapeutic_direction": "Antagonism predicted to reduce disease progression.", + "data_used_to_support_target_selection": "genotype, RNAseq, tau tangle counts", + "data_synapseid": "syn12345", + "study": "Study_1", + "input_data": "Genetics, RNA, Protein, Clinical", + "validation_study_details": null, + "initial_nomination": 2018.0 + } + ], + "median_expression": null, + "druggability": [ + { + "sm_druggability_bucket": 7, + "safety_bucket": 4, + "abability_bucket": 3, + "pharos_class": "Tbio", + "classification": null, + "safety_bucket_definition": "More than two of: high off target gene expression, cancer driver, essential gene, associated deleterious genetic disorder, HPO phenotype associated gene, or black box warning on clinically used drug.", + "abability_bucket_definition": "Cell membrane-bound proteins. Highly accessible to antibody-based therapies, but potentially less so than secreted proteins or ECM components." + } + ], + "total_nominations": 1.0, + "biodomains": [ + "Proteostasis", + "Synapse" + ], + "is_adi": true, + "is_tep": true, + "resource_url": "https://adknowledgeportal.synapse.org/Explore/Target%20Enabling%20Resources?QueryWrapper0=%7B%22sql%22%3A%22select%20*%20from%20syn26146692%20WHERE%20%60isPublic%60%20%3D%20true%22%2C%22limit%22%3A25%2C%22offset%22%3A0%2C%22selectedFacets%22%3A%5B%7B%22concreteType%22%3A%22org.sagebionetworks.repo.model.table.FacetColumnValuesRequest%22%2C%22columnName%22%3A%22target%22%2C%22facetValues%22%3A%5B%22TNMD%22%5D%7D%5D%7D", + "ensembl_info": { + "ensembl_release": "111", + "ensembl_possible_replacements": [], + "ensembl_permalink": "https://jan2024.archive.ensembl.org/Homo_sapiens/Gene/Summary?db=core;g=ENSG00000000005" + } + }, + { + "ensembl_gene_id": "ENSG00000000419", + "name": "dolichyl-phosphate mannosyltransferase subunit 1, catalytic", + "summary": "Dolichol-phosphate mannose (Dol-P-Man) serves as a donor of mannosyl residues on the lumenal side of the endoplasmic reticulum (ER). Lack of Dol-P-Man results in defective surface expression of GPI-anchored proteins. Dol-P-Man is synthesized from GDP-mannose and dolichol-phosphate on the cytosolic side of the ER by the enzyme dolichyl-phosphate mannosyltransferase. Human DPM1 lacks a carboxy-terminal transmembrane domain and signal sequence and is regulated by DPM2. Mutations in this gene are associated with congenital disorder of glycosylation type Ie. Alternative splicing results in multiple transcript variants. [provided by RefSeq, Nov 2015].", + "symbol": "DPM1", + "alias": [ + "MPDS", + "CDGIE" + ], + "is_igap": true, + "is_eqtl": true, + "is_any_rna_changed_in_ad_brain": true, + "rna_brain_change_studied": true, + "is_any_protein_changed_in_ad_brain": true, + "protein_brain_change_studied": true, + "target_nominations": [ + { + "source": "Source_1", + "team": "Team_2", + "rank": null, + "hgnc_symbol": "DPM1", + "target_choice_justification": "Genes are upregulated in AD based on single cell analysis of microglia subtypes", + "predicted_therapeutic_direction": "Antagonism predicted to reduce disease progression.", + "data_used_to_support_target_selection": "scRNAseq and snRNAseq in microglia", + "data_synapseid": null, + "study": "Study_1", + "input_data": null, + "validation_study_details": "Not prioritized for experimental validation ", + "initial_nomination": 2023.0 + }, + { + "source": "Source_2", + "team": "Team_3", + "rank": "12", + "hgnc_symbol": "DPM1", + "target_choice_justification": "By applying the algorithm and process developed in https://doi.org/10.1038/s41598-020-79740-x, we identified frequently co-expressed gene modules", + "predicted_therapeutic_direction": null, + "data_used_to_support_target_selection": "AD and normal brain transcriptomic datasets", + "data_synapseid": null, + "study": "Study_2", + "input_data": "RNA", + "validation_study_details": "not prioritized for experimental validation", + "initial_nomination": 2022.0 + }, + { + "source": "Source_3", + "team": "Team_4", + "rank": "1-10", + "hgnc_symbol": "DPM1", + "target_choice_justification": null, + "predicted_therapeutic_direction": "Agonism predicted to reduce disease progression.", + "data_used_to_support_target_selection": "TMT proteomics (partial dataset)", + "data_synapseid": "syn56789", + "study": "Study_3", + "input_data": "Protein, Clinical", + "validation_study_details": "not prioritized for experimental validation", + "initial_nomination": 2018.0 + } + ], + "median_expression": [ + { + "min": 4.13956136706118, + "first_quartile": 4.81701951155434, + "median": 4.99243993099552, + "mean": 4.97609994885496, + "third_quartile": 5.12201928513349, + "max": 5.47427830219618, + "tissue": "TCX" + }, + { + "min": 2.46285760909518, + "first_quartile": 3.81231183101654, + "median": 4.11076676482308, + "mean": 4.10960211499538, + "third_quartile": 4.40350009435052, + "max": 5.84841488692408, + "tissue": "DLPFC" + }, + { + "min": 2.43407916590901, + "first_quartile": 3.81701148864582, + "median": 4.16532587564932, + "mean": 4.13384839958097, + "third_quartile": 4.50050882932851, + "max": 5.35079455494603, + "tissue": "IFG" + } + ], + "druggability": null, + "total_nominations": 3.0, + "biodomains": [ + "Apoptosis" + ], + "is_adi": false, + "is_tep": false, + "resource_url": "https://adknowledgeportal.synapse.org/Explore/Target%20Enabling%20Resources?QueryWrapper0=%7B%22sql%22%3A%22select%20*%20from%20syn26146692%20WHERE%20%60isPublic%60%20%3D%20true%22%2C%22limit%22%3A25%2C%22offset%22%3A0%2C%22selectedFacets%22%3A%5B%7B%22concreteType%22%3A%22org.sagebionetworks.repo.model.table.FacetColumnValuesRequest%22%2C%22columnName%22%3A%22target%22%2C%22facetValues%22%3A%5B%22DPM1%22%5D%7D%5D%7D", + "ensembl_info": { + "ensembl_release": "111", + "ensembl_possible_replacements": [], + "ensembl_permalink": "https://jan2024.archive.ensembl.org/Homo_sapiens/Gene/Summary?db=core;g=ENSG00000000419" + } + }, + { + "ensembl_gene_id": "ENSG00000000457", + "name": "SCY1 like pseudokinase 3", + "summary": "This gene encodes a protein with a kinase domain and four HEAT repeats. The encoded protein interacts with the C-terminal domain of ezrin, an ERM protein, and may play a role in cell adhesion and migration. Alternative splicing results in multiple transcript variants encoding multiple isoforms. [provided by RefSeq, Jun 2012].", + "symbol": "SCYL3", + "alias": [ + "PACE-1", + "PACE1" + ], + "is_igap": false, + "is_eqtl": false, + "is_any_rna_changed_in_ad_brain": false, + "rna_brain_change_studied": false, + "is_any_protein_changed_in_ad_brain": false, + "protein_brain_change_studied": false, + "target_nominations": [ + { + "source": "Source_4", + "team": "Team_5", + "rank": null, + "hgnc_symbol": "SCYL3", + "target_choice_justification": "Genes are downregulated in AD based on single cell analysis of microglia subtypes", + "predicted_therapeutic_direction": "Agonism predicted to reduce disease progression.", + "data_used_to_support_target_selection": null, + "data_synapseid": null, + "study": "Study_3", + "input_data": "RNA", + "validation_study_details": "Not prioritized for experimental validation ", + "initial_nomination": null + } + ], + "median_expression": [ + { + "min": 1.87170194617441, + "first_quartile": 3.30725630078229, + "median": 3.55768647846662, + "mean": 3.53029087812701, + "third_quartile": 3.79362805388354, + "max": 4.29012220609186, + "tissue": "TCX" + } + ], + "druggability": null, + "total_nominations": 1.0, + "biodomains": [ + "Structural Stabilization" + ], + "is_adi": false, + "is_tep": false, + "resource_url": null, + "ensembl_info": { + "ensembl_release": null, + "ensembl_possible_replacements": [], + "ensembl_permalink": "https://jan2024.archive.ensembl.org/Homo_sapiens/Gene/Summary?db=core;g=ENSG00000000457" + } + }, + { + "ensembl_gene_id": "ENSG00000000460", + "name": "FIGNL1 interacting regulator of recombination and mitosis", + "summary": null, + "symbol": "FIRRM", + "alias": [ + "FLIP", + "MEICA1", + "C1orf112", + "Apolo1" + ], + "is_igap": true, + "is_eqtl": false, + "is_any_rna_changed_in_ad_brain": false, + "rna_brain_change_studied": false, + "is_any_protein_changed_in_ad_brain": false, + "protein_brain_change_studied": false, + "target_nominations": null, + "median_expression": null, + "druggability": [ + { + "sm_druggability_bucket": 13, + "safety_bucket": 3, + "abability_bucket": 3, + "pharos_class": "Tdark", + "classification": "Unknown: There is no information on ligands or structure in any of the categories above. ", + "safety_bucket_definition": "Two or fewer of: high off-target gene expression, cancer driver, essential gene, associated deleterious genetic disorder, HPO phenotype associated gene, or black box warning on clinically used drug.", + "abability_bucket_definition": null + } + ], + "total_nominations": null, + "biodomains": null, + "is_adi": false, + "is_tep": false, + "resource_url": null, + "ensembl_info": { + "ensembl_release": "111", + "ensembl_possible_replacements": [], + "ensembl_permalink": "https://jan2024.archive.ensembl.org/Homo_sapiens/Gene/Summary?db=core;g=ENSG00000000460" + } + }, + { + "ensembl_gene_id": "ENSG00000000938", + "name": "FGR proto-oncogene, Src family tyrosine kinase", + "summary": "This gene is a member of the Src family of protein tyrosine kinases (PTKs). The encoded protein contains N-terminal sites for myristylation and palmitylation, a PTK domain, and SH2 and SH3 domains which are involved in mediating protein-protein interactions with phosphotyrosine-containing and proline-rich motifs, respectively. The protein localizes to plasma membrane ruffles, and functions as a negative regulator of cell migration and adhesion triggered by the beta-2 integrin signal transduction pathway. Infection with Epstein-Barr virus results in the overexpression of this gene. Multiple alternatively spliced variants, encoding the same protein, have been identified. [provided by RefSeq, Jul 2008].", + "symbol": "FGR", + "alias": [ + "c-fgr", + "SRC2", + "p55c-fgr", + "p58-Fgr", + "p55-Fgr", + "p58c-fgr", + "c-src2" + ], + "is_igap": false, + "is_eqtl": false, + "is_any_rna_changed_in_ad_brain": false, + "rna_brain_change_studied": false, + "is_any_protein_changed_in_ad_brain": false, + "protein_brain_change_studied": false, + "target_nominations": null, + "median_expression": null, + "druggability": null, + "total_nominations": null, + "biodomains": null, + "is_adi": false, + "is_tep": false, + "resource_url": null, + "ensembl_info": { + "ensembl_release": "111", + "ensembl_possible_replacements": [], + "ensembl_permalink": "https://jan2024.archive.ensembl.org/Homo_sapiens/Gene/Summary?db=core;g=ENSG00000000938" + } + }, + { + "ensembl_gene_id": "ENSG00000000971", + "name": null, + "summary": "This gene is a member of the Regulator of Complement Activation (RCA) gene cluster and encodes a protein with twenty short consensus repeat (SCR) domains. This protein is secreted into the bloodstream and has an essential role in the regulation of complement activation, restricting this innate defense mechanism to microbial infections. Mutations in this gene have been associated with hemolytic-uremic syndrome (HUS) and chronic hypocomplementemic nephropathy. Alternate transcriptional splice variants, encoding different isoforms, have been characterized. [provided by RefSeq, Oct 2011].", + "symbol": "CFH", + "alias": [], + "is_igap": true, + "is_eqtl": true, + "is_any_rna_changed_in_ad_brain": false, + "rna_brain_change_studied": false, + "is_any_protein_changed_in_ad_brain": true, + "protein_brain_change_studied": true, + "target_nominations": null, + "median_expression": [ + { + "min": 2.37922531067355, + "first_quartile": 3.77894297956489, + "median": 4.77526159965158, + "mean": 4.66424852097104, + "third_quartile": 5.42402633058216, + "max": 7.4816750917471, + "tissue": null + }, + { + "min": 1.7463973196137, + "first_quartile": 3.69304174986212, + "median": 4.33355530502762, + "mean": 4.38230106499301, + "third_quartile": 4.99383675215591, + "max": 8.17049589317842, + "tissue": "DLPFC" + } + ], + "druggability": [ + { + "sm_druggability_bucket": 3, + "safety_bucket": 4, + "abability_bucket": 1, + "pharos_class": "Tbio", + "classification": "Targetable by structure: Structurally druggable protein, based on the presence of a druggable pocket in the protein (DrugEBIlity/CanSAR).", + "safety_bucket_definition": "More than two of: high off target gene expression, cancer driver, essential gene, associated deleterious genetic disorder, HPO phenotype associated gene, or black box warning on clinically used drug.", + "abability_bucket_definition": "Secreted protein. Highly accessible to antibody-based therapies." + } + ], + "total_nominations": null, + "biodomains": null, + "is_adi": false, + "is_tep": false, + "resource_url": null, + "ensembl_info": { + "ensembl_release": "111", + "ensembl_possible_replacements": [], + "ensembl_permalink": "https://jan2024.archive.ensembl.org/Homo_sapiens/Gene/Summary?db=core;g=ENSG00000000971" + } + }, + { + "ensembl_gene_id": "ENSG00000001036", + "name": "alpha-L-fucosidase 2", + "summary": "This gene encodes a plasma alpha-L-fucosidase, which represents 10-20% of the total cellular fucosidase activity. The protein is a member of the glycosyl hydrolase 29 family, and catalyzes the hydrolysis of the alpha-1,6-linked fucose joined to the reducing-end N-acetylglucosamine of the carbohydrate moieties of glycoproteins. This enzyme is essential for Helicobacter pylori adhesion to human gastric cancer cells. [provided by RefSeq, Aug 2010].", + "symbol": "FUCA2", + "alias": [ + "dJ20N2.5" + ], + "is_igap": false, + "is_eqtl": false, + "is_any_rna_changed_in_ad_brain": false, + "rna_brain_change_studied": false, + "is_any_protein_changed_in_ad_brain": true, + "protein_brain_change_studied": true, + "target_nominations": null, + "median_expression": [ + { + "min": 2.54806826249572, + "first_quartile": 3.41668592265363, + "median": 3.69078894708659, + "mean": 3.67318399089501, + "third_quartile": 3.89709834932296, + "max": 4.68782752662201, + "tissue": "TCX" + }, + { + "min": 2.03791264465019, + "first_quartile": 3.43682539452037, + "median": 3.63555121276883, + "mean": 3.63300507492418, + "third_quartile": 3.86171607052173, + "max": 4.79978134261528, + "tissue": "IFG" + } + ], + "druggability": [ + { + "sm_druggability_bucket": 1, + "safety_bucket": 3, + "abability_bucket": 1, + "pharos_class": "Tchem", + "classification": "Small molecule druggable: Protein with a small molecule ligand identified from ChEMBL, meeting TCRD activity criteria.", + "safety_bucket_definition": null, + "abability_bucket_definition": "Secreted protein. Highly accessible to antibody-based therapies." + } + ], + "total_nominations": null, + "biodomains": null, + "is_adi": false, + "is_tep": false, + "resource_url": null, + "ensembl_info": { + "ensembl_release": "111", + "ensembl_possible_replacements": [], + "ensembl_permalink": "https://jan2024.archive.ensembl.org/Homo_sapiens/Gene/Summary?db=core;g=ENSG00000001036" + } + }, + { + "ensembl_gene_id": "ENSG00000001084", + "name": "glutamate-cysteine ligase catalytic subunit", + "summary": "Glutamate-cysteine ligase, also known as gamma-glutamylcysteine synthetase is the first rate-limiting enzyme of glutathione synthesis. The enzyme consists of two subunits, a heavy catalytic subunit and a light regulatory subunit. This locus encodes the catalytic subunit, while the regulatory subunit is derived from a different gene located on chromosome 1p22-p21. Mutations at this locus have been associated with hemolytic anemia due to deficiency of gamma-glutamylcysteine synthetase and susceptibility to myocardial infarction.[provided by RefSeq, Oct 2010].", + "symbol": "GCLC", + "alias": [ + "GLCL", + "GCL", + "GLCLC", + "GCS" + ], + "is_igap": true, + "is_eqtl": false, + "is_any_rna_changed_in_ad_brain": false, + "rna_brain_change_studied": false, + "is_any_protein_changed_in_ad_brain": true, + "protein_brain_change_studied": true, + "target_nominations": null, + "median_expression": null, + "druggability": [ + { + "sm_druggability_bucket": 1, + "safety_bucket": 5, + "abability_bucket": 3, + "pharos_class": null, + "classification": "Small molecule druggable: Protein with a small molecule ligand identified from ChEMBL, meeting TCRD activity criteria.", + "safety_bucket_definition": "Clinical data with evidence of intolerable safety profile/adverse drug reactions in the desired modality and with target engagement. Drug for target withdrawn on those grounds.", + "abability_bucket_definition": "Cell membrane-bound proteins. Highly accessible to antibody-based therapies, but potentially less so than secreted proteins or ECM components." + } + ], + "total_nominations": null, + "biodomains": null, + "is_adi": false, + "is_tep": true, + "resource_url": "https://adknowledgeportal.synapse.org/Explore/Target%20Enabling%20Resources?QueryWrapper0=%7B%22sql%22%3A%22select%20*%20from%20syn26146692%20WHERE%20%60isPublic%60%20%3D%20true%22%2C%22limit%22%3A25%2C%22offset%22%3A0%2C%22selectedFacets%22%3A%5B%7B%22concreteType%22%3A%22org.sagebionetworks.repo.model.table.FacetColumnValuesRequest%22%2C%22columnName%22%3A%22target%22%2C%22facetValues%22%3A%5B%22GCLC%22%5D%7D%5D%7D", + "ensembl_info": { + "ensembl_release": "111", + "ensembl_possible_replacements": [], + "ensembl_permalink": null + } + }, + { + "ensembl_gene_id": "ENSG00000001167", + "name": "nuclear transcription factor Y subunit alpha", + "summary": "The protein encoded by this gene is one subunit of a trimeric complex, forming a highly conserved transcription factor that binds to CCAAT motifs in the promoter regions in a variety of genes. Subunit A associates with a tight dimer composed of the B and C subunits, resulting in a trimer that binds to DNA with high specificity and affinity. The sequence specific interactions of the complex are made by the A subunit, suggesting a role as the regulatory subunit. In addition, there is evidence of post-transcriptional regulation in this gene product, either by protein degradation or control of translation. Further regulation is represented by alternative splicing in the glutamine-rich activation domain, with clear tissue-specific preferences for the two isoforms. [provided by RefSeq, Jul 2008].", + "symbol": "NFYA", + "alias": [ + "CBF-B", + "NF-YA", + "HAP2", + "CBF-A" + ], + "is_igap": false, + "is_eqtl": false, + "is_any_rna_changed_in_ad_brain": false, + "rna_brain_change_studied": false, + "is_any_protein_changed_in_ad_brain": false, + "protein_brain_change_studied": false, + "target_nominations": null, + "median_expression": null, + "druggability": null, + "total_nominations": null, + "biodomains": null, + "is_adi": false, + "is_tep": false, + "resource_url": null, + "ensembl_info": { + "ensembl_release": "111", + "ensembl_possible_replacements": [], + "ensembl_permalink": "https://jan2024.archive.ensembl.org/Homo_sapiens/Gene/Summary?db=core;g=ENSG00000001167" + } + }, + { + "ensembl_gene_id": "ENSG00000001460", + "name": "sperm tail PG-rich repeat containing 1", + "summary": "Involved in positive regulation of apoptotic process and positive regulation of mitochondrial membrane permeability involved in apoptotic process. Predicted to be located in mitochondrion and nucleus. [provided by Alliance of Genome Resources, Apr 2022]", + "symbol": "STPG1", + "alias": [ + "C1orf201", + "MAPO2" + ], + "is_igap": true, + "is_eqtl": true, + "is_any_rna_changed_in_ad_brain": true, + "rna_brain_change_studied": true, + "is_any_protein_changed_in_ad_brain": false, + "protein_brain_change_studied": false, + "target_nominations": null, + "median_expression": null, + "druggability": null, + "total_nominations": null, + "biodomains": null, + "is_adi": false, + "is_tep": false, + "resource_url": null, + "ensembl_info": { + "ensembl_release": "111", + "ensembl_possible_replacements": [], + "ensembl_permalink": "https://jan2024.archive.ensembl.org/Homo_sapiens/Gene/Summary?db=core;g=ENSG00000001460" + } + }, + { + "ensembl_gene_id": "ENSG00000001461", + "name": "NIPA like domain containing 3", + "summary": "Predicted to enable magnesium ion transmembrane transporter activity. Predicted to be involved in magnesium ion transport. Predicted to be integral component of membrane. Predicted to be active in membrane. [provided by Alliance of Genome Resources, Apr 2022]", + "symbol": null, + "alias": [ + "DJ462O23.2", + "SLC57A5", + "NPAL3" + ], + "is_igap": false, + "is_eqtl": false, + "is_any_rna_changed_in_ad_brain": false, + "rna_brain_change_studied": false, + "is_any_protein_changed_in_ad_brain": false, + "protein_brain_change_studied": false, + "target_nominations": null, + "median_expression": null, + "druggability": null, + "total_nominations": null, + "biodomains": null, + "is_adi": false, + "is_tep": false, + "resource_url": null, + "ensembl_info": { + "ensembl_release": "111", + "ensembl_possible_replacements": [], + "ensembl_permalink": "https://jan2024.archive.ensembl.org/Homo_sapiens/Gene/Summary?db=core;g=ENSG00000001461" + } + }, + { + "ensembl_gene_id": "ENSG00000001497", + "name": "LAS1 like ribosome biogenesis factor", + "summary": "Enables RNA binding activity. Predicted to be involved in maturation of 5.8S rRNA and maturation of LSU-rRNA. Located in membrane. Part of MLL1 complex. Implicated in Wilson-Turner syndrome. [provided by Alliance of Genome Resources, Apr 2022]", + "symbol": "LAS1L", + "alias": [ + "Las1-like", + "Las1", + "dJ475B7.2", + "MRXSWTS", + "WTS" + ], + "is_igap": false, + "is_eqtl": false, + "is_any_rna_changed_in_ad_brain": false, + "rna_brain_change_studied": false, + "is_any_protein_changed_in_ad_brain": false, + "protein_brain_change_studied": false, + "target_nominations": null, + "median_expression": null, + "druggability": null, + "total_nominations": null, + "biodomains": null, + "is_adi": true, + "is_tep": false, + "resource_url": "https://adknowledgeportal.synapse.org/Explore/Target%20Enabling%20Resources?QueryWrapper0=%7B%22sql%22%3A%22select%20*%20from%20syn26146692%20WHERE%20%60isPublic%60%20%3D%20true%22%2C%22limit%22%3A25%2C%22offset%22%3A0%2C%22selectedFacets%22%3A%5B%7B%22concreteType%22%3A%22org.sagebionetworks.repo.model.table.FacetColumnValuesRequest%22%2C%22columnName%22%3A%22target%22%2C%22facetValues%22%3A%5B%22LAS1L%22%5D%7D%5D%7D", + "ensembl_info": { + "ensembl_release": "111", + "ensembl_possible_replacements": [], + "ensembl_permalink": "https://jan2024.archive.ensembl.org/Homo_sapiens/Gene/Summary?db=core;g=ENSG00000001497" + } + }, + { + "ensembl_gene_id": "ENSG00000001561", + "name": "ectonucleotide pyrophosphatase/phosphodiesterase 4", + "summary": "Enables bis(5'-adenosyl)-triphosphatase activity. Involved in positive regulation of blood coagulation and purine ribonucleoside catabolic process. Located in extracellular exosome and membrane. [provided by Alliance of Genome Resources, Apr 2022]", + "symbol": "ENPP4", + "alias": [ + "NPP4" + ], + "is_igap": false, + "is_eqtl": false, + "is_any_rna_changed_in_ad_brain": false, + "rna_brain_change_studied": false, + "is_any_protein_changed_in_ad_brain": false, + "protein_brain_change_studied": false, + "target_nominations": null, + "median_expression": null, + "druggability": null, + "total_nominations": null, + "biodomains": null, + "is_adi": false, + "is_tep": false, + "resource_url": null, + "ensembl_info": { + "ensembl_release": "111", + "ensembl_possible_replacements": [], + "ensembl_permalink": "https://jan2024.archive.ensembl.org/Homo_sapiens/Gene/Summary?db=core;g=ENSG00000001561" + } + }, + { + "ensembl_gene_id": "ENSG00000001617", + "name": "semaphorin 3F", + "summary": "This gene encodes a member of the semaphorin III family of secreted signaling proteins that are involved in axon guidance during neuronal development. The encoded protein contains an N-terminal Sema domain, an immunoglobulin loop and a C-terminal basic domain. This gene is expressed by the endothelial cells where it was found to act in an autocrine fashion to induce apoptosis, inhibit cell proliferation and survival, and function as an anti-tumorigenic agent. Alternative splicing results in multiple transcript variants encoding different isoforms. [provided by RefSeq, Jan 2016].", + "symbol": "SEMA3F", + "alias": [ + "SEMA4", + "SEMA-IV", + "SEMAK" + ], + "is_igap": false, + "is_eqtl": false, + "is_any_rna_changed_in_ad_brain": false, + "rna_brain_change_studied": false, + "is_any_protein_changed_in_ad_brain": false, + "protein_brain_change_studied": false, + "target_nominations": null, + "median_expression": null, + "druggability": null, + "total_nominations": null, + "biodomains": null, + "is_adi": false, + "is_tep": false, + "resource_url": null, + "ensembl_info": { + "ensembl_release": "111", + "ensembl_possible_replacements": [], + "ensembl_permalink": "https://jan2024.archive.ensembl.org/Homo_sapiens/Gene/Summary?db=core;g=ENSG00000001617" + } + }, + { + "ensembl_gene_id": "ENSG00000001626", + "name": "CF transmembrane conductance regulator", + "summary": "This gene encodes a member of the ATP-binding cassette (ABC) transporter superfamily. The encoded protein functions as a chloride channel, making it unique among members of this protein family, and controls ion and water secretion and absorption in epithelial tissues. Channel activation is mediated by cycles of regulatory domain phosphorylation, ATP-binding by the nucleotide-binding domains, and ATP hydrolysis. Mutations in this gene cause cystic fibrosis, the most common lethal genetic disorder in populations of Northern European descent. The most frequently occurring mutation in cystic fibrosis, DeltaF508, results in impaired folding and trafficking of the encoded protein. Multiple pseudogenes have been identified in the human genome. [provided by RefSeq, Aug 2017].", + "symbol": "CFTR", + "alias": [ + "ABCC7", + "MRP7", + "CFTR/MRP", + "dJ760C5.1", + "TNR-CFTR", + "ABC35", + "CF" + ], + "is_igap": false, + "is_eqtl": true, + "is_any_rna_changed_in_ad_brain": true, + "rna_brain_change_studied": true, + "is_any_protein_changed_in_ad_brain": false, + "protein_brain_change_studied": false, + "target_nominations": null, + "median_expression": null, + "druggability": null, + "total_nominations": null, + "biodomains": null, + "is_adi": false, + "is_tep": false, + "resource_url": null, + "ensembl_info": { + "ensembl_release": "111", + "ensembl_possible_replacements": [], + "ensembl_permalink": "https://jan2024.archive.ensembl.org/Homo_sapiens/Gene/Summary?db=core;g=ENSG00000001626" + } + }, + { + "ensembl_gene_id": "ENSG00000001629", + "name": "ankyrin repeat and IBR domain containing 1", + "summary": "Predicted to enable ubiquitin conjugating enzyme binding activity and ubiquitin protein ligase activity. Predicted to be involved in positive regulation of proteasomal ubiquitin-dependent protein catabolic process; protein polyubiquitination; and ubiquitin-dependent protein catabolic process. Predicted to be part of ubiquitin ligase complex. Predicted to be active in cytoplasm. [provided by Alliance of Genome Resources, Apr 2022]", + "symbol": "ANKIB1", + "alias": [], + "is_igap": false, + "is_eqtl": false, + "is_any_rna_changed_in_ad_brain": true, + "rna_brain_change_studied": true, + "is_any_protein_changed_in_ad_brain": true, + "protein_brain_change_studied": true, + "target_nominations": null, + "median_expression": null, + "druggability": null, + "total_nominations": null, + "biodomains": null, + "is_adi": false, + "is_tep": false, + "resource_url": null, + "ensembl_info": { + "ensembl_release": "111", + "ensembl_possible_replacements": [], + "ensembl_permalink": "https://jan2024.archive.ensembl.org/Homo_sapiens/Gene/Summary?db=core;g=ENSG00000001629" + } + }, + { + "ensembl_gene_id": "ENSG00000001630", + "name": "cytochrome P450 family 51 subfamily A member 1", + "summary": "This gene encodes a member of the cytochrome P450 superfamily of enzymes. The cytochrome P450 proteins are monooxygenases which catalyze many reactions involved in drug metabolism and synthesis of cholesterol, steroids and other lipids. This endoplasmic reticulum protein participates in the synthesis of cholesterol by catalyzing the removal of the 14alpha-methyl group from lanosterol. Homologous genes are found in all three eukaryotic phyla, fungi, plants, and animals, suggesting that this is one of the oldest cytochrome P450 genes. Two transcript variants encoding different isoforms have been found for this gene. [provided by RefSeq, Mar 2009].", + "symbol": "CYP51A1", + "alias": [ + "P450L1", + "P450-14DM", + "LDM", + "CP51", + "CYPL1", + "CYP51" + ], + "is_igap": false, + "is_eqtl": false, + "is_any_rna_changed_in_ad_brain": false, + "rna_brain_change_studied": false, + "is_any_protein_changed_in_ad_brain": true, + "protein_brain_change_studied": true, + "target_nominations": null, + "median_expression": null, + "druggability": null, + "total_nominations": null, + "biodomains": null, + "is_adi": false, + "is_tep": false, + "resource_url": null, + "ensembl_info": { + "ensembl_release": "111", + "ensembl_possible_replacements": [], + "ensembl_permalink": "https://jan2024.archive.ensembl.org/Homo_sapiens/Gene/Summary?db=core;g=ENSG00000001630" + } + }, + { + "ensembl_gene_id": "ENSG00000001631", + "name": "KRIT1 ankyrin repeat containing", + "summary": "This gene encodes a protein containing four ankyrin repeats, a band 4.1/ezrin/radixin/moesin (FERM) domain, and multiple NPXY sequences. The encoded protein is localized in the nucleus and cytoplasm. It binds to integrin cytoplasmic domain-associated protein-1 alpha (ICAP1alpha), and plays a critical role in beta1-integrin-mediated cell proliferation. It associates with junction proteins and RAS-related protein 1A (Rap1A), which requires the encoded protein for maintaining the integrity of endothelial junctions. It is also a microtubule-associated protein and may play a role in microtubule targeting. Mutations in this gene result in cerebral cavernous malformations. Multiple alternatively spliced transcript variants have been found for this gene. [provided by RefSeq, Sep 2009].", + "symbol": "KRIT1", + "alias": [ + "CAM", + "CCM1" + ], + "is_igap": false, + "is_eqtl": false, + "is_any_rna_changed_in_ad_brain": true, + "rna_brain_change_studied": true, + "is_any_protein_changed_in_ad_brain": true, + "protein_brain_change_studied": true, + "target_nominations": null, + "median_expression": [ + { + "min": 2.38916769066134, + "first_quartile": 3.42971842714164, + "median": 3.58242842659155, + "mean": 3.56276683938211, + "third_quartile": 3.71417050416855, + "max": 3.99659888901762, + "tissue": "TCX" + }, + { + "min": 3.0677246327387, + "first_quartile": 3.75499700008387, + "median": 3.88052592466859, + "mean": 3.87774467019765, + "third_quartile": 4.01405541625413, + "max": 4.42188371387498, + "tissue": "IFG" + } + ], + "druggability": null, + "total_nominations": null, + "biodomains": null, + "is_adi": false, + "is_tep": false, + "resource_url": null, + "ensembl_info": { + "ensembl_release": "111", + "ensembl_possible_replacements": [], + "ensembl_permalink": "https://jan2024.archive.ensembl.org/Homo_sapiens/Gene/Summary?db=core;g=ENSG00000001631" + } + }, + { + "ensembl_gene_id": "ENSG00000161149", + "name": null, + "summary": null, + "symbol": null, + "alias": [], + "is_igap": false, + "is_eqtl": false, + "is_any_rna_changed_in_ad_brain": false, + "rna_brain_change_studied": false, + "is_any_protein_changed_in_ad_brain": false, + "protein_brain_change_studied": false, + "target_nominations": null, + "median_expression": null, + "druggability": null, + "total_nominations": null, + "biodomains": null, + "is_adi": false, + "is_tep": false, + "resource_url": null, + "ensembl_info": { + "ensembl_release": "107", + "ensembl_possible_replacements": [ + "ENSG00000284130" + ], + "ensembl_permalink": "https://jul2022.archive.ensembl.org/Homo_sapiens/Gene/Summary?db=core;g=ENSG00000161149" + } + }, + { + "ensembl_gene_id": "ENSG00000183791", + "name": null, + "summary": null, + "symbol": null, + "alias": [], + "is_igap": false, + "is_eqtl": false, + "is_any_rna_changed_in_ad_brain": false, + "rna_brain_change_studied": false, + "is_any_protein_changed_in_ad_brain": false, + "protein_brain_change_studied": false, + "target_nominations": null, + "median_expression": null, + "druggability": null, + "total_nominations": null, + "biodomains": null, + "is_adi": false, + "is_tep": false, + "resource_url": "https://adknowledgeportal.synapse.org/Explore/Target%20Enabling%20Resources?QueryWrapper0=%7B%22sql%22%3A%22select%20*%20from%20syn26146692%20WHERE%20%60isPublic%60%20%3D%20true%22%2C%22limit%22%3A25%2C%22offset%22%3A0%2C%22selectedFacets%22%3A%5B%7B%22concreteType%22%3A%22org.sagebionetworks.repo.model.table.FacetColumnValuesRequest%22%2C%22columnName%22%3A%22target%22%2C%22facetValues%22%3A%5B%22ABCD%22%5D%7D%5D%7D", + "ensembl_info": { + "ensembl_release": "100", + "ensembl_possible_replacements": [ + "ENSG00000288631", + "ENSG00000288616", + "ENSG00000288607" + ], + "ensembl_permalink": "https://apr2020.archive.ensembl.org/Homo_sapiens/Gene/Summary?db=core;g=ENSG00000183791" + } + } +] \ No newline at end of file diff --git a/tests/transform/test_gene_info.py b/tests/transform/test_gene_info.py new file mode 100644 index 00000000..0d1e7790 --- /dev/null +++ b/tests/transform/test_gene_info.py @@ -0,0 +1,131 @@ +import json +import os + +import pandas as pd +import pytest + +from agoradatatools.etl.transform import gene_info + + +class TestTransformGeneInfo: + data_files_path = "tests/test_assets/gene_info" + param_set_1 = { + "adjusted_p_value_threshold": 0.05, + "protein_level_threshold": 0.05, + } + param_set_2 = { + "adjusted_p_value_threshold": 1, + "protein_level_threshold": 1, + } + + pass_test_data = [ + ( # Pass with good data on param set 1 + { + "gene_metadata": "gene_metadata_good_input.feather", + "igap": "igap_good_input.csv", + "eqtl": "eqtl_good_input.csv", + "proteomics": "proteomics_good_input.csv", + "diff_exp_data": "diff_exp_data_good_input.csv", + "proteomics_tmt": "proteomics_tmt_good_input.csv", + "proteomics_srm": "proteomics_srm_good_input.csv", + "target_list": "target_list_good_input.csv", + "median_expression": "median_expression_good_input.csv", + "druggability": "druggability_good_input.csv", + "genes_biodomains": "genes_biodomains_good_input.csv", + "tep_adi_info": "tep_adi_info_good_input.csv", + }, + "gene_info_good_output_1.json", + param_set_1, + ), + ( # Pass with good data on param set 2 + { + "gene_metadata": "gene_metadata_good_input.feather", + "igap": "igap_good_input.csv", + "eqtl": "eqtl_good_input.csv", + "proteomics": "proteomics_good_input.csv", + "diff_exp_data": "diff_exp_data_good_input.csv", + "proteomics_tmt": "proteomics_tmt_good_input.csv", + "proteomics_srm": "proteomics_srm_good_input.csv", + "target_list": "target_list_good_input.csv", + "median_expression": "median_expression_good_input.csv", + "druggability": "druggability_good_input.csv", + "genes_biodomains": "genes_biodomains_good_input.csv", + "tep_adi_info": "tep_adi_info_good_input.csv", + }, + "gene_info_good_output_2.json", + param_set_2, + ), + ] + pass_test_ids = [ + "Pass with good data on parameter set 1", + "Pass with good data on parameter set 2", + ] + fail_test_data = [ + ( # Bad data type + "??", + param_set_1, + ValueError, + ), + ] + fail_test_ids = [ + "Fail with bad data type in ?? column", + ] + + def read_input_files_dict(self, input_files_dict): + """Utility function to read a dictionary of filenames into a dictionary of data frames. Most files for + gene_info are in csv format, but the 'gene_metadata' file is in feather format and needs special casing. + + Args: + input_files_dict - a dictionary where keys are the names of the datasets, as expected by + transform_gene_info, and values are the filenames to load + + Returns: + datasets - a dictionary where the keys are the names of the datasets, as expected by + transform_gene_info, and the values are data frames + """ + datasets = {} + for key, value in input_files_dict.items(): + filename = os.path.join(self.data_files_path, "input", value) + if key == "gene_metadata": + datasets[key] = pd.read_feather(filename) + else: + datasets[key] = pd.read_csv(filename) + + return datasets + + @pytest.mark.parametrize( + "input_files_dict, expected_output_file, param_set", + pass_test_data, + ids=pass_test_ids, + ) + def test_transform_gene_info_should_pass( + self, input_files_dict, expected_output_file, param_set + ): + datasets = self.read_input_files_dict(input_files_dict) + + output_df = gene_info.transform_gene_info( + datasets=datasets, + adjusted_p_value_threshold=param_set["adjusted_p_value_threshold"], + protein_level_threshold=param_set["protein_level_threshold"], + ) + + json_file = os.path.join(self.data_files_path, "output", expected_output_file) + expected_df = pd.read_json(json_file) + pd.testing.assert_frame_equal(output_df, expected_df) + + @pytest.mark.parametrize( + "input_files_dict, param_set, error_type", + fail_test_data, + ids=fail_test_ids, + ) + def test_transform_gene_info_should_fail( + self, input_files_dict, param_set, error_type + ): + with pytest.raises(error_type): + datasets = self.read_input_files_dict(input_files_dict) + + gene_info.transform_gene_info( + datasets=datasets, + adjusted_p_value_threshold=param_set["adjusted_p_value_threshold"], + protein_level_threshold=param_set["protein_level_threshold"], + ) From c346b35362de87a04d83c5d5932d592450d827ac Mon Sep 17 00:00:00 2001 From: Jaclyn Beck Date: Wed, 7 Feb 2024 15:28:08 -0800 Subject: [PATCH 02/13] Fixed a bug where resource URL was generated even when is_adi and is_tep were both false --- src/agoradatatools/etl/transform/gene_info.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/agoradatatools/etl/transform/gene_info.py b/src/agoradatatools/etl/transform/gene_info.py index bc30dcd4..86d1504a 100644 --- a/src/agoradatatools/etl/transform/gene_info.py +++ b/src/agoradatatools/etl/transform/gene_info.py @@ -101,7 +101,7 @@ def transform_gene_info( resource_url_suffix = "%22%5D%7D%5D%7D" tep_info["resource_url"] = tep_info.apply( lambda row: resource_url_prefix + row["hgnc_symbol"] + resource_url_suffix - if row["is_adi"] or row["is_tep"] + if row["is_adi"]==True or row["is_tep"]==True else np.NaN, axis=1, ) From 16f20e089ad91e8bc2d54e891d702c8be0ab6421 Mon Sep 17 00:00:00 2001 From: Jaclyn Beck Date: Wed, 7 Feb 2024 15:30:40 -0800 Subject: [PATCH 03/13] Added test input files that will cause the gene_info transform to throw errors, created failure case test code --- .../input/diff_exp_data_type_error.csv | 14 +++ .../input/proteomics_srm_type_error.csv | 7 ++ .../input/proteomics_tmt_type_error.csv | 7 ++ .../gene_info/input/proteomics_type_error.csv | 14 +++ .../input/tep_adi_info_good_input.csv | 2 +- .../input/tep_adi_info_type_error.csv | 6 ++ tests/transform/test_gene_info.py | 90 ++++++++++++------- 7 files changed, 105 insertions(+), 35 deletions(-) create mode 100644 tests/test_assets/gene_info/input/diff_exp_data_type_error.csv create mode 100644 tests/test_assets/gene_info/input/proteomics_srm_type_error.csv create mode 100644 tests/test_assets/gene_info/input/proteomics_tmt_type_error.csv create mode 100644 tests/test_assets/gene_info/input/proteomics_type_error.csv create mode 100644 tests/test_assets/gene_info/input/tep_adi_info_type_error.csv diff --git a/tests/test_assets/gene_info/input/diff_exp_data_type_error.csv b/tests/test_assets/gene_info/input/diff_exp_data_type_error.csv new file mode 100644 index 00000000..40d20241 --- /dev/null +++ b/tests/test_assets/gene_info/input/diff_exp_data_type_error.csv @@ -0,0 +1,14 @@ +model,tissue,comparison,ensembl_gene_id,logfc,ci_l,ci_r,aveexpr,t,p_value,adj_p_val,gene_biotype,chromosome_name,direction,hgnc_symbol,percentage_gc_content,gene_length,sex,study +Diagnosis,TCX,AD-CONTROL,ENSG00000001626,-0.38386144170829,-0.610418505477203,-0.157304377939378,1.0964718993346,-3.32039574535246,0.0009658465939259,string_value,protein_coding,7,DOWN,CFTR,36.5864869618047,250188,ALL,MAYO +Diagnosis,TCX,AD-CONTROL,ENSG00000001631,0.0960797874215311,0.0277369852074512,0.164422589635611,3.66470161931573,2.79371699517244,0.0054146957075407,0.0128646180744981,protein_coding,7,NONE,KRIT1,36.4761218695707,47198,ALL,MAYO +,TCX,AD-CONTROL,ENSG00000001629,0.0688865504482834,-0.0336046266228221,0.171377727519389,6.83925693988808,1.3190719562984,0.187765592003833,0.266881156254369,protein_coding,7,NONE,ANKIB1,36.8283894215301,155410,ALL,MAYO +Diagnosis,,AD-CONTROL,ENSG00000001460,-0.0428271747433806,-0.134111094837718,0.0484567453509564,4.53063964529137,-0.922749954447199,0.356592428630043,0.451948815550479,protein_coding,1,NONE,STPG1,44.0903630539242,59936,ALL,MAYO +Diagnosis,TCX,,ENSG00000000419,-0.0106100593393075,-0.0731334456790523,0.0519133270004373,4.97936316930159,-0.335784216826404,0.737176767177833,0.798556808985985,protein_coding,20,NONE,DPM1,39.8497192789902,23689,ALL,MAYO +Diagnosis,IFG,AD-CONTROL,ENSG00000000419,-0.0878179824082075,-0.182373477784594,0.0067375129681794,4.32089559628026,-1.82192928105604,0.0688901841106443,0.155209255305467,,20,NONE,DPM1,39.8497192789902,23689,ALL,MSSM +Diagnosis,IFG,AD-CONTROL,ENSG00000001629,0.0804858875345706,0.0073127133165788,0.153659061752562,7.93343801331035,2.16036740054897,0.0310645962980247,0.0847168839428919,protein_coding,,NONE,ANKIB1,36.8283894215301,155410,ALL,MSSM +Diagnosis,IFG,AD-CONTROL,ENSG00000001460,-0.0342369800592941,-0.114230375069546,0.0457564149509574,3.8133273490137,-0.842377670501157,0.399870699901111,0.554357331277541,protein_coding,1,,STPG1,44.0903630539242,59936,,MSSM +Diagnosis,IFG,AD-CONTROL,ENSG00000001631,-0.0286960913417103,-0.0945307456366793,0.0371385629532588,4.11960673387962,-0.856820989104849,0.391728896583518,0.546803082969529,protein_coding,7,NONE,KRIT1,36.4761218695707,47198,ALL, +Diagnosis,DLPFC,AD-CONTROL,ENSG00000000419,-0.0882919008770653,-0.131304264580892,-0.0452795371732381,3.92713408775626,-4.02835272367389,5.78948411033998e-05,0.0006377563132996,protein_coding,20,NONE,DPM1,39.8497192789902,23689,ALL,ROSMAP +Diagnosis,DLPFC,AD-CONTROL,ENSG00000001629,0.0841212438799088,0.0354911663215749,0.132751321438243,6.65482116592828,3.39203333183632,0.0007048884218731,0.0045744978486257,protein_coding,7,NONE,ANKIB1,36.8283894215301,155410,ALL,ROSMAP +Diagnosis,DLPFC,AD-CONTROL,ENSG00000001460,-0.0398165705036053,-0.0915728594627508,0.0119397184555402,4.01629214749823,-1.50874762983754,0.131493224324285,0.252297356550177,protein_coding,1,NONE,STPG1,44.0903630539242,59936,ALL,ROSMAP +Diagnosis,DLPFC,AD-CONTROL,ENSG00000001631,-0.0281454867160997,-0.0700675860372825,0.013776612605083,2.93709783776462,-1.31934379342664,0.187178523092241,0.327003220657394,protein_coding,7,NONE,KRIT1,36.4761218695707,47198,ALL,ROSMAP diff --git a/tests/test_assets/gene_info/input/proteomics_srm_type_error.csv b/tests/test_assets/gene_info/input/proteomics_srm_type_error.csv new file mode 100644 index 00000000..7fe545de --- /dev/null +++ b/tests/test_assets/gene_info/input/proteomics_srm_type_error.csv @@ -0,0 +1,7 @@ +uniqid,genename,uniprotid,ensembl_gene_id,tissue,log2_fc,ci_upr,ci_lwr,pval,cor_pval +CD2AP|Q9Y5K6,CD2AP,Q9Y5K6,ENSG00000001630,DLPFC,0.026321079163413537,0.10244645627426802,-0.049804297947440936,0.6959456373623885,string_value +,SNCA,P37840,ENSG00000001629,DLPFC,-0.02568871764608687,0.028374837069623488,-0.07975227236179723,0.5048364890502188,0.7196605269439289 +NDUFA7|O95182,,O95182,ENSG00000000419,DLPFC,-0.08256234002725381,-0.03426991654584031,-0.13085476350866732,0.00019022687174463648,0.0010779522732196067 +DIP2B|Q9P265,DIP2B,,ENSG00000001036,DLPFC,0.002289361997716892,0.07241425980972072,-0.06783553581428692,0.9967679822609551, +VSNL1|P62760,VSNL1,P62760,ENSG00000000971,,-0.013745470109882328,0.014686673450034729,-0.042177613669799385,0.4928746100735819,0.7178825842376084 +SYT11|Q9BT88,SYT11,Q9BT88,ENSG00000000005,DLPFC,0.029120588040412152,0.07403386661338425,-0.015792690532559942,0.2810529496558294,0.4955407270247518 diff --git a/tests/test_assets/gene_info/input/proteomics_tmt_type_error.csv b/tests/test_assets/gene_info/input/proteomics_tmt_type_error.csv new file mode 100644 index 00000000..4219f33d --- /dev/null +++ b/tests/test_assets/gene_info/input/proteomics_tmt_type_error.csv @@ -0,0 +1,7 @@ +uniqid,genename,uniprotid,ensembl_gene_id,tissue,log2_fc,ci_upr,ci_lwr,pval,cor_pval +CYP51A1|A0A0C4DFL7,CYP51A1,A0A0C4DFL7,ENSG00000001630,DLPFC,-0.096536825520855,-0.0249064834553162,-0.168167167586394,0.00861397335314594,string_value +ANKIB1|Q9P2G1,ANKIB1,Q9P2G1,ENSG00000001629,,-0.0231875933838178,-0.00261466480061534,-0.0437605219670202,0.0274462753402693,0.124442505184433 +,DPM1,H0Y368,ENSG00000000419,DLPFC,-0.0318866351834386,-0.00176441716357351,-0.0620088532033036,0.038161929543103,0.153081771056205 +FUCA2|Q9BTY2,,Q9BTY2,ENSG00000001036,DLPFC,-0.0889169453741395,0.00602502277420743,-0.183858913522486,0.0661508543031291,0.213410933915364 +CFH|P08603,CFH,P08603,ENSG00000000971,DLPFC,,0.156029348948909,-0.00385979845880688,0.0619708516992043,0.204796476548682 +KRIT1|O00522,KRIT1,,ENSG00000001631,DLPFC,0.052873152684011,0.12115434837832,-0.0154080430102979,0.128072571527854,0.318778967948762 diff --git a/tests/test_assets/gene_info/input/proteomics_type_error.csv b/tests/test_assets/gene_info/input/proteomics_type_error.csv new file mode 100644 index 00000000..ad7e56aa --- /dev/null +++ b/tests/test_assets/gene_info/input/proteomics_type_error.csv @@ -0,0 +1,14 @@ +uniqid,genename,uniprotid,ensembl_gene_id,tissue,log2_fc,ci_upr,ci_lwr,pval,cor_pval +DPM1|O60762,DPM1,O60762,ENSG00000000419,DLPFC,-0.000532480602341678,0.108873058799453,-0.109938020004136,0.999999279258724,string_value +GCLC|P48506,GCLC,P48506,ENSG00000001084,DLPFC,0.172360117768928,0.241008334458834,0.103711901079023,3.16356041363264e-09,6.82372624103319e-08 +CFH|P08603,CFH,P08603,ENSG00000000971,DLPFC,,,,, +CYP51A1|Q16850,CYP51A1,Q16850,ENSG00000001630,DLPFC,,,,, +GCLC|P48506,GCLC,P48506,ENSG00000001084,MFG,0.0113505662359543,0.230040224838345,-0.207339092366436,0.991302618555794,0.999999988150877 +CFH|P08603,CFH,P08603,ENSG00000000971,,0.00509733585480276,0.799985494934676,-0.789790823225071,0.999866632256444,0.999999988150877 +CYP51A1|Q16850,CYP51A1,Q16850,ENSG00000001630,MFG,,,,, +GCLC|P48506,GCLC,P48506,ENSG00000001084,TCX,-0.0429270360844782,0.0424170350995262,-0.128271107268483,0.461818930212383,0.999999901144234 +CFH|P08603,CFH,P08603,ENSG00000000971,TCX,,,,, +CYP51A1|Q16850,CYP51A1,Q16850,ENSG00000001630,TCX,,,,, +GCLC|P48506,,P48506,ENSG00000001084,AntPFC,-0.0232034474563143,0.0767886612547822,-0.123195556167411,0.848120820467051,0.999999904091243 +CFH|P08603,CFH,P08603,ENSG00000000971,AntPFC,,,,, +,CYP51A1,Q16850,ENSG00000001630,AntPFC,0.264726640853272,0.566731977706236,-0.0372786959996912,0.0989464176618443,0.564688784959532 diff --git a/tests/test_assets/gene_info/input/tep_adi_info_good_input.csv b/tests/test_assets/gene_info/input/tep_adi_info_good_input.csv index fdeaf6d5..ae081666 100644 --- a/tests/test_assets/gene_info/input/tep_adi_info_good_input.csv +++ b/tests/test_assets/gene_info/input/tep_adi_info_good_input.csv @@ -1,6 +1,6 @@ ensembl_gene_id,hgnc_symbol,is_adi,is_tep ENSG00000000005,TNMD,True,True -ENSG00000000419,DPM1,, +ENSG00000000419,DPM1,,False ENSG00000001497,LAS1L,True, ENSG00000001084,GCLC,,True ENSG00000183791,ABCD,, diff --git a/tests/test_assets/gene_info/input/tep_adi_info_type_error.csv b/tests/test_assets/gene_info/input/tep_adi_info_type_error.csv new file mode 100644 index 00000000..4a4732a0 --- /dev/null +++ b/tests/test_assets/gene_info/input/tep_adi_info_type_error.csv @@ -0,0 +1,6 @@ +ensembl_gene_id,hgnc_symbol,is_adi,is_tep +ENSG00000000005,,True,True +ENSG00000000419,DPM1,,False +ENSG00000001497,LAS1L,True, +ENSG00000001084,GCLC,,True +ENSG00000183791,ABCD,, diff --git a/tests/transform/test_gene_info.py b/tests/transform/test_gene_info.py index 0d1e7790..2e3642f0 100644 --- a/tests/transform/test_gene_info.py +++ b/tests/transform/test_gene_info.py @@ -18,40 +18,29 @@ class TestTransformGeneInfo: "protein_level_threshold": 1, } + core_files = { + "gene_metadata": "gene_metadata_good_input.feather", + "igap": "igap_good_input.csv", + "eqtl": "eqtl_good_input.csv", + "proteomics": "proteomics_good_input.csv", + "diff_exp_data": "diff_exp_data_good_input.csv", + "proteomics_tmt": "proteomics_tmt_good_input.csv", + "proteomics_srm": "proteomics_srm_good_input.csv", + "target_list": "target_list_good_input.csv", + "median_expression": "median_expression_good_input.csv", + "druggability": "druggability_good_input.csv", + "genes_biodomains": "genes_biodomains_good_input.csv", + "tep_adi_info": "tep_adi_info_good_input.csv", + } + pass_test_data = [ ( # Pass with good data on param set 1 - { - "gene_metadata": "gene_metadata_good_input.feather", - "igap": "igap_good_input.csv", - "eqtl": "eqtl_good_input.csv", - "proteomics": "proteomics_good_input.csv", - "diff_exp_data": "diff_exp_data_good_input.csv", - "proteomics_tmt": "proteomics_tmt_good_input.csv", - "proteomics_srm": "proteomics_srm_good_input.csv", - "target_list": "target_list_good_input.csv", - "median_expression": "median_expression_good_input.csv", - "druggability": "druggability_good_input.csv", - "genes_biodomains": "genes_biodomains_good_input.csv", - "tep_adi_info": "tep_adi_info_good_input.csv", - }, + core_files, "gene_info_good_output_1.json", param_set_1, ), ( # Pass with good data on param set 2 - { - "gene_metadata": "gene_metadata_good_input.feather", - "igap": "igap_good_input.csv", - "eqtl": "eqtl_good_input.csv", - "proteomics": "proteomics_good_input.csv", - "diff_exp_data": "diff_exp_data_good_input.csv", - "proteomics_tmt": "proteomics_tmt_good_input.csv", - "proteomics_srm": "proteomics_srm_good_input.csv", - "target_list": "target_list_good_input.csv", - "median_expression": "median_expression_good_input.csv", - "druggability": "druggability_good_input.csv", - "genes_biodomains": "genes_biodomains_good_input.csv", - "tep_adi_info": "tep_adi_info_good_input.csv", - }, + core_files, "gene_info_good_output_2.json", param_set_2, ), @@ -61,14 +50,43 @@ class TestTransformGeneInfo: "Pass with good data on parameter set 2", ] fail_test_data = [ - ( # Bad data type - "??", + ( # Bad data type in diff_exp_data + core_files, + {"diff_exp_data": "diff_exp_data_type_error.csv"}, + param_set_1, + TypeError, + ), + ( # Bad data type in proteomics + core_files, + {"proteomics": "proteomics_type_error.csv"}, + param_set_1, + TypeError, + ), + ( # Bad data type in proteomics_tmt + core_files, + {"proteomics_tmt": "proteomics_tmt_type_error.csv"}, param_set_1, - ValueError, + TypeError, + ), + ( # Bad data type in proteomics_srm + core_files, + {"proteomics_srm": "proteomics_srm_type_error.csv"}, + param_set_1, + TypeError, + ), + ( # Missing HGNC in tep_adi_info + core_files, + {"tep_adi_info": "tep_adi_info_type_error.csv"}, + param_set_1, + TypeError, ), ] fail_test_ids = [ - "Fail with bad data type in ?? column", + "Fail with bad data type in diff_exp_data's adj_p_val column", + "Fail with bad data type in proteomics's cor_pval column", + "Fail with bad data type in proteomics_tmt's cor_pval column", + "Fail with bad data type in proteomics_srm's cor_pval column", + "Fail with missing hgnc_symbol in tep_adi_info", ] def read_input_files_dict(self, input_files_dict): @@ -114,13 +132,17 @@ def test_transform_gene_info_should_pass( pd.testing.assert_frame_equal(output_df, expected_df) @pytest.mark.parametrize( - "input_files_dict, param_set, error_type", + "input_files_dict, failure_case_files_dict, param_set, error_type", fail_test_data, ids=fail_test_ids, ) def test_transform_gene_info_should_fail( - self, input_files_dict, param_set, error_type + self, input_files_dict, failure_case_files_dict, param_set, error_type ): + # Any files specified in 'failure_case_files_dict' will replace their default "good" files in input_files_dict + for key, value in failure_case_files_dict.items(): + input_files_dict[key] = value + with pytest.raises(error_type): datasets = self.read_input_files_dict(input_files_dict) From 19d52a97a8f828ed4e8c265abceaf1e57ba58dad Mon Sep 17 00:00:00 2001 From: Jaclyn Beck Date: Wed, 7 Feb 2024 15:33:53 -0800 Subject: [PATCH 04/13] Removed unused test asset file --- .../input/gene_metadata_unaltered.feather | Bin 13154 -> 0 bytes 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 tests/test_assets/gene_info/input/gene_metadata_unaltered.feather diff --git a/tests/test_assets/gene_info/input/gene_metadata_unaltered.feather b/tests/test_assets/gene_info/input/gene_metadata_unaltered.feather deleted file mode 100644 index 0e2da1ecf95f8d30a3703e57cee0cf5faf68b013..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 13154 zcmeHud2}4rneTUN=|yU(TZ?7c2D>a4TR|;#wH7a6tX6BaCApDgj4_T?>MpekcXu^a z)wUakV910(W|El9u=$0=5C|~|3rWRd{`ndF6llL-VuV3;f?nUev8aUcW+=68Fs zWjoBgNjT^I!l zkIMWV*#J-X;Fss28v1N0?^AeQ)?{Q7`mSwpGnGqc?55Sqmdl#9BMom&mX=BCO)W}O zMoa5R$mCKf#B4{i97IEfcQcFEaPeK|@9O1Skx{I0TcLKnRPXWCOdEPCvF-S@td>b= z_BJP<>$Cq%MIbE3DlUqG8_x){KOBtGda-&C+PiQS=*v^&`scq$-W&UScJ83QL zTso~;`IB3nlW{`3|2yhFsl_t=T6#*??7}Ab)AT#1!28>StZCcER7&4w>DiPP*VB5Y zuqTi2#(R}Ip)c>tDr9vljb+YkKY1OF%Q~U$KVu~dJkHWJ$FvHpCuK}oeEq?BvU$z2 z%$>XoJ9Nu7%nVOeJ6c1n;U+ME^3I-{*&1%`Xu6!h*@=psBl6;X77Y9W;2K~8cpGVl zI1dLp0M6rk-+A|4aC(``hf_*vcoe=rI%>p2D3>jk_0=QZ0+eOM>3HX9e`>LiOeuW# zl{xJ+B8`8`1AiKhp62g=k1{`*Q%*rzxtt&GClDqTe<&p>vf}iY4^~7pj=C_ zT%qsfabije{e~!`!v4my5#=s;k2>e}wHsx2&nc(i+j3vI%$&=l5Tz{2-aDuK+`jU4 zE3#6en^5MN_bA8f=M0|L_Ylei(0dP7-CdB?c9-I%=+XGarG(iN^2_mMrG7SYf<(xa;@5Cx3ZdNomrC(+eT{qrBQ%yKG<0e) zUj8k|%l+L~%5N$a_}&TS%keWx`LieFm*Z!Z@|DHKm(`{CWGP;XQaN63|4=Fat#Sc8 zD}lN6z_ZHn(z5+O$N9HBM2nB3zvcQ$%YC%Ia)EMwIbK@ka`dF~6QvxfX{2$o`zl0t z^Ax^?_y~i*Ily}0Lcjoa0Url$0PX-D03HE;4E!4~3xG#aHJ1KVpru3tNIDiNwUcP*?e>xpRe|mUo2^U)8?SPs>)Qei~9k?OjM-*?}Lm>q1(nUP!Cd zj;{68+_5T13p*E6bC=40!n^Qw_{X$T?P`C7y6|hz`D&M{b_8isUD>{36)c5fD9>Z5 zKZt_Ml@n3qJ?4imbwBW7;6~sv;2GdQfcY426fl6RfxCd81OE>AK?kdVByjZ{;Im@j zi9D40!&%MiuRD6iOq+=ugmuD98S$BXDwv%y?d*)^=t^44WK27sa$thk)4B!yMzM2K zkkgK$wkUBZ{HeTS#K)tXRV8I?*OjcT=Mp9?7hbfze`sKISXGRSgKy&wOajHylerWZ z*NQofNZd?Mp|At#tib3{(uQN&hTT6frerPCF@q*-3C?L-lyNJrOlxT)l~;kvZ;dFg9340_n?--2!g}G>0BHoVa(`CIGM^hS{maD#`CtL^Y2qe5?S9T=>@tn zlZ!)xvy?X}61R-(tt4)x`psw3;iXEUHk%7^#nFtE5*r9wm@D)@3)Mr3n;9f$`0&)x zp;+WJHj^;ZNI8|p+J__K>Zx?VGNw%1OzX;&@dmYHh&w3)+sV9+0PEM4wy$8mc%kLn9ykd4{3KA1`}39-%VF1a&bq?#P#c0x0czS50UsV zUBM@EsAuecP@EdiJ7#>w`ZD##IyyspnbN7A%EUXF5>YbrW!+NL^1|ehzj}}2CWhWkciv5xV+ zkqDp7iLuD&`k@{*x;fUVMv8M$Y=-{K@mJ$>t`_kzt_1jA=39LV;G+2>02j@BfiD4H z10Dr9k9!e#8F+gR@LCK!k%#jB#p^44kARSaMSHuNo{kj^??Sl~_{%sFUu)$#d|)!b zaIV}0oCz!k&I0%N1nmzFw-5G*BS=;84n#(JVrpVtI5Zk=?Hm}6 z4r2?sX;zF5M8~5&{lT8m{@~`wU?di_;9py zMWk~y7QuGYY&FQmf)|oIv94o9XU~e(@W|K&sEAyqMfbk#OPRX9vwn~_ReTO=Vil~msu86J9asJkxDO zD+oYrZpDEtR*($`0)$sWaZDvXV`dUoS#k%5T9lnL5J+dzo0S^GvgH^lEvW|~vf{?H z5m%C_eB6xDp6Of$V%W^s8z^QwMmo2Zq|Nki5E8nQgt6?*KnmMfaw9WiOhL*!xhY+l zHet@MrB^*%X6sNCoEaUyos<&SQz^Sud6J|omI7VWQnra2@GohYzo8DuxiLD-OsiEH zz~G=NC_6OExPaInNUCSlL+9DH88?V2JB=GCrdWo({aSjp79u_ik*y>QZAy1^8_EW} z8W!3#r*zwJ@(1Z5Teos)C8h7sQ}$PB)@yUAo<>}RwavzJLgVd!G?80*H!l`ENaKiikq;?P?xC84jR z8ta!dySN1lW9E{WG4v9ej00^Vvt60dY$dN_=1c`sm(bIgZO3|(W|y`qTfnb(K-edg zsk}0#Pmk-DK(EoWaNLGgZEfu>Xkcbp1N#(LfM;bBy32B81!KVxEz_)E*gG_VT7+1_ z%$Qby3C(mUp8|P6(aC{w%u9tAq1xo&2f}+ z*{#EL*o)=2^BIr^pW}ow4f0e5dSk(Vq8j~oY;nTP#K$Sd^$~k9d7mJqZ)^h-o}uWa z`LrIP#rZ)t>pxD{Y*hwxP(c+g;uUzdw#V2Z5cib1D<6bLnKo81rc$<~4qZrC&Pibs z%8zX%6|!vok{ou^H;MgPtVw=CWWQwBp{j7JGLmytf%UL?PP4Q|v-Ae>RT0_`_MZWb z7RzayRziQCv3Erk>hAyvA@RN;y;X^%&~8TC!**q%EJNYDhh3G%gc;C3AEPm@H#)b8 zYcr1hNkO`G)ZBrURXew~!t{Sx@PnI(2gS_=rF9-wVA{+h=~nkzN}sA*%$1A?@`%d zSh;x~d8=N#i#+HvoBh44OFNvpe zV1<^>s?YMW7U5Od*$Hk3?uQ*)Fjs6i06)ZNOiC!r$D?D-dj*;=RNUo>*?Qd4uaF}! z*-T;wrc$Pv0Q=Il^bVq1-FvVb!2(d(5l;v!@T|asRBQc^#3x-1@h51W_5`t;g%~C; z?tDjJpJy>61M}`BclR`!{98{pKBHv}J1u)A!Kp2CYKH2KDj{J1fcfiPg6nh47xF$l z1Fl&xEQ2}g51yrVzNg}KFSsC7m1fA61mvG(;!sMN$!E=P5o@K%jGjf>O+rK7CwmsS zmKx?0F6wtJ(boxVGGR=^dVPjTZ%LkEA%s30k`>*0+U?)sl0qBePQBcyZV=ppK%e#; zk+q`1cbxgSs1lph@Z5``+lm?rhC^qx%k&}xpKo*1qayo)*R`622iRGLvqth?BZCCp zS#%I$I`x$6wD79EvM?xVMVr}@PMgt}Q6Mi@iXS84LF!gpx`J{A{merT6XeFlrjdEA zlD-In8l0%VP z@?rL3Xlv`hF`N10mGn(gVKLv$*oS=G+cW0QM}#Ytq=plhV6u(Qfa`z+u_EBiuv%k` zO->t@?I_DC&w~z_Hnxc3v&7zjg758|G0bylscpcQ(<89gytGUTgJ@cnT5)!va-Pq>21QVXBg21jCB6gcV(_Z!q zCSKr@2jS}WbtiPo=ybDJDyYpT4U25jOCXN>q<}dn(!A<`{ZsK8H5(2GvthMGYSNsn zBwSDapcL2#YJ*Av37=ao-U?|BWy*j|$$I>^Fh6aSUbJ&|T+cdCmQv74Rta`G54K|^ z{zY1d667lOBrz8Y829<;0w4PZqnCtBGzEJmt$$8n z*UHMSmZv;{^iKIIOaX=qkB2K@ZDs5#H~V!3eMj!*ZjOtp>970|cux{%dB&sBNMvHt zLkB(dBnxCMMxSx1@U`6Qp%*KeAUEmGGKPJtLtPv15*i|3trUOmQPFCz5J=oj43q{% zBXN*ul4^Qpg!J?8ws`78trl*a}vy%K# zpR|nFEgsqKueuhZ4u>F4@CFvpK3~n2iu47>Y`1h;AOt1qT<`7{MK_>q zSf3;Dw-V+2VM|MNkYZYgS;WDfn`HDvmC#UiUaQidvsRO53PbaXD_lZWaIf|XFS%$0 zlMMsaMNdoe$1B*mE@1~rN+tWe=j%mRkYm_(?zS2gdXM6+i(gs^??k7w(`DvdvX=IXw9Cg9SH`(7 zN!NZT&Nm#Y+Df}yl^8a<5&u7u@WX(#Us`TtK1^b`GF9+O<<4<+M^Y){LKoe`LeQk2 zm9k^H4fP>@TjkA>tOYeA9Nu~{#h~&{rS1>{$(h;^)VrX-1_k-M#HuNjHLV9l_9e!u zL~lO@rheje-|ed&<=z*6CvGS_Te(zAsiI5h;~RsEs2%#Hu}jqI8trtC%gy{}hynIWWgi@H+6)d+SSXvbzUYZ?U)z3xwpIu~aMw)5 zYQ^(J?*g__-vtBWf{G@6*G+C3CFvT${|24L?)R}|y|l?Ae5cyg82IHJciYg=(6trx zJd;UbCUcbZGp{1TC*O~y^ zK98npk7ER#TpGgRj=Dg+sex88|Htc^1)rC$XRdh_{&rVYjdoFOpliY_^)mmEAooTl zV33}$LLiYUMlb`LuZF%gEC$xpQc-X?Q!q-3n>z&86|`m_O-2imC;Vvi`q|R;dg>m zPh;8?^!1;4DN+&0p5b8!8ii}zJz#XvzaioGRl?WZry2JAM&WIDD4vgN@^W+flgYdQM|4FUW6 z?$ER`^-^7p9rboAY12L&IFLr4VT{69Pkp38x~^AGIa+^bXyt4` zy1h1Lq_dh)T?flFmFOipz%Bt%gA2z6LCjPL*9me&peiQ*W9?YM7vHp$-X%EW}RzewKPwZwz$|kqAN_| zMww2nraP*oGpYI=wS2) zKcg$$(Q~^*6>i}Veo+qiT1XscHMZZq2h(lB5KQWKH%gB;2#@&#mrhs2GI^+J&ysN4 z0#|!QC}HIeO2Q4S)?QODd@~@vSwml06wsfjkC-4k4=G0CEVr6a%uo2vOBq=s@dct^ zx%|!|LA(o{Vi8FD?D^HKO$z8+y{vf=yI*p{5qV0L_z($K)zflcC~fWtpDH-jxLL5L z_8{&-;RuEMQgYeL1Cf-LG4G;7DOj>_)}+j&nX{8(t=`InEpC$hhbfhU8^O|HUFWvr z;=(T9L-<~6QQ)ab8Z8ghF|&aVQyxk)Cfe?+W}oq}Hzo0Tzj&(D4I0q@U8?*AqdG6O zvZkHpEiU%BUw*p5{~@>6JtT3P4|W)Qr4v+nn;@MlDI?nDKH7CE+sELd781D0#^lRH z`kYJL-^iX{*v*d-Zt@DBYM?*LiZVqk($T86{lmf|t~N+mW0#R>OPjFqGs^OQ*q{d* zgceZjyMDT95j^V`Z2l)dT&Mn)MU1^t!(M5WPm$RtT(!CF0=+GW*NXFpeisn_+2`3W z%9nfR2QO+6cMDBH_0oBlD9hp8jH%}ewaxI=>Wk>Mib)vM@EINSi*57CyWn&+I779; z>V<83VvCp9UXpVpcGc8f{IDWj%!DUsax5|)wDcb+@@MIQqDGYEV_H^?G+$iDd=1@~ zm5K!3H=9_SSinchMvcfGlV+E_&Nl<9t;XeizZN{C&qfKUk&0Apw7u{DKqkQ}U& zqvtB{Hx@kiOYuA%SO)O32Y$ZFz4z@v7qAN8=dDLWIZ|qm`{B%# zAg};OB3a;{fyJ2c9Pl`w_;M&mO5^(#(vPg2ah$BZx~=VEEdv#-qgAuwGw_+UmM++| znpU!nZ~bkymWAP9w{bUU+j#Mo+SS_u^Iwtx`2k+&ScT!K9C4v#qx0ZcRxbML(rQ}}O1;rkEn%YJZQmfsyd>2~F@ zd$GsgW;~Yum){|rdlz`_WwVnnFP@nD{4?TJHkIVYYSK@>FvPV@GE~h@YFZ=)RzU(0MgGk~3 G*8CSYEx*+O From 63fab64b8ab87e1a072360a06b2a4d009e696a37 Mon Sep 17 00:00:00 2001 From: Jaclyn Beck Date: Wed, 7 Feb 2024 15:41:48 -0800 Subject: [PATCH 05/13] Updated expected output files for gene_info testing to work with tep_adi bug fix --- .../test_assets/gene_info/output/gene_info_good_output_1.json | 4 ++-- .../test_assets/gene_info/output/gene_info_good_output_2.json | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/test_assets/gene_info/output/gene_info_good_output_1.json b/tests/test_assets/gene_info/output/gene_info_good_output_1.json index 274dee47..69a87922 100644 --- a/tests/test_assets/gene_info/output/gene_info_good_output_1.json +++ b/tests/test_assets/gene_info/output/gene_info_good_output_1.json @@ -152,7 +152,7 @@ ], "is_adi": false, "is_tep": false, - "resource_url": "https://adknowledgeportal.synapse.org/Explore/Target%20Enabling%20Resources?QueryWrapper0=%7B%22sql%22%3A%22select%20*%20from%20syn26146692%20WHERE%20%60isPublic%60%20%3D%20true%22%2C%22limit%22%3A25%2C%22offset%22%3A0%2C%22selectedFacets%22%3A%5B%7B%22concreteType%22%3A%22org.sagebionetworks.repo.model.table.FacetColumnValuesRequest%22%2C%22columnName%22%3A%22target%22%2C%22facetValues%22%3A%5B%22DPM1%22%5D%7D%5D%7D", + "resource_url": null, "ensembl_info": { "ensembl_release": "111", "ensembl_possible_replacements": [], @@ -811,7 +811,7 @@ "biodomains": null, "is_adi": false, "is_tep": false, - "resource_url": "https://adknowledgeportal.synapse.org/Explore/Target%20Enabling%20Resources?QueryWrapper0=%7B%22sql%22%3A%22select%20*%20from%20syn26146692%20WHERE%20%60isPublic%60%20%3D%20true%22%2C%22limit%22%3A25%2C%22offset%22%3A0%2C%22selectedFacets%22%3A%5B%7B%22concreteType%22%3A%22org.sagebionetworks.repo.model.table.FacetColumnValuesRequest%22%2C%22columnName%22%3A%22target%22%2C%22facetValues%22%3A%5B%22ABCD%22%5D%7D%5D%7D", + "resource_url": null, "ensembl_info": { "ensembl_release": "100", "ensembl_possible_replacements": [ diff --git a/tests/test_assets/gene_info/output/gene_info_good_output_2.json b/tests/test_assets/gene_info/output/gene_info_good_output_2.json index 91bb925b..e5572df5 100644 --- a/tests/test_assets/gene_info/output/gene_info_good_output_2.json +++ b/tests/test_assets/gene_info/output/gene_info_good_output_2.json @@ -152,7 +152,7 @@ ], "is_adi": false, "is_tep": false, - "resource_url": "https://adknowledgeportal.synapse.org/Explore/Target%20Enabling%20Resources?QueryWrapper0=%7B%22sql%22%3A%22select%20*%20from%20syn26146692%20WHERE%20%60isPublic%60%20%3D%20true%22%2C%22limit%22%3A25%2C%22offset%22%3A0%2C%22selectedFacets%22%3A%5B%7B%22concreteType%22%3A%22org.sagebionetworks.repo.model.table.FacetColumnValuesRequest%22%2C%22columnName%22%3A%22target%22%2C%22facetValues%22%3A%5B%22DPM1%22%5D%7D%5D%7D", + "resource_url": null, "ensembl_info": { "ensembl_release": "111", "ensembl_possible_replacements": [], @@ -811,7 +811,7 @@ "biodomains": null, "is_adi": false, "is_tep": false, - "resource_url": "https://adknowledgeportal.synapse.org/Explore/Target%20Enabling%20Resources?QueryWrapper0=%7B%22sql%22%3A%22select%20*%20from%20syn26146692%20WHERE%20%60isPublic%60%20%3D%20true%22%2C%22limit%22%3A25%2C%22offset%22%3A0%2C%22selectedFacets%22%3A%5B%7B%22concreteType%22%3A%22org.sagebionetworks.repo.model.table.FacetColumnValuesRequest%22%2C%22columnName%22%3A%22target%22%2C%22facetValues%22%3A%5B%22ABCD%22%5D%7D%5D%7D", + "resource_url": null, "ensembl_info": { "ensembl_release": "100", "ensembl_possible_replacements": [ From 93b40c4a82439024153fc9d71b8be2e09806a781 Mon Sep 17 00:00:00 2001 From: Jaclyn Beck Date: Fri, 9 Feb 2024 16:15:16 -0800 Subject: [PATCH 06/13] Simplified gene info test asset files to have fewer lines for the failure cases --- .../gene_info/input/diff_exp_data_type_error.csv | 11 ----------- .../gene_info/input/genes_biodomains_good_input.csv | 1 + .../gene_info/input/proteomics_srm_type_error.csv | 4 ---- .../gene_info/input/proteomics_tmt_type_error.csv | 4 ---- .../gene_info/input/proteomics_type_error.csv | 11 ----------- ...o_type_error.csv => tep_adi_info_type_error_1.csv} | 3 --- .../gene_info/input/tep_adi_info_type_error_2.csv | 3 +++ .../gene_info/input/tep_adi_info_type_error_3.csv | 3 +++ 8 files changed, 7 insertions(+), 33 deletions(-) rename tests/test_assets/gene_info/input/{tep_adi_info_type_error.csv => tep_adi_info_type_error_1.csv} (55%) create mode 100644 tests/test_assets/gene_info/input/tep_adi_info_type_error_2.csv create mode 100644 tests/test_assets/gene_info/input/tep_adi_info_type_error_3.csv diff --git a/tests/test_assets/gene_info/input/diff_exp_data_type_error.csv b/tests/test_assets/gene_info/input/diff_exp_data_type_error.csv index 40d20241..f2c78adb 100644 --- a/tests/test_assets/gene_info/input/diff_exp_data_type_error.csv +++ b/tests/test_assets/gene_info/input/diff_exp_data_type_error.csv @@ -1,14 +1,3 @@ model,tissue,comparison,ensembl_gene_id,logfc,ci_l,ci_r,aveexpr,t,p_value,adj_p_val,gene_biotype,chromosome_name,direction,hgnc_symbol,percentage_gc_content,gene_length,sex,study Diagnosis,TCX,AD-CONTROL,ENSG00000001626,-0.38386144170829,-0.610418505477203,-0.157304377939378,1.0964718993346,-3.32039574535246,0.0009658465939259,string_value,protein_coding,7,DOWN,CFTR,36.5864869618047,250188,ALL,MAYO Diagnosis,TCX,AD-CONTROL,ENSG00000001631,0.0960797874215311,0.0277369852074512,0.164422589635611,3.66470161931573,2.79371699517244,0.0054146957075407,0.0128646180744981,protein_coding,7,NONE,KRIT1,36.4761218695707,47198,ALL,MAYO -,TCX,AD-CONTROL,ENSG00000001629,0.0688865504482834,-0.0336046266228221,0.171377727519389,6.83925693988808,1.3190719562984,0.187765592003833,0.266881156254369,protein_coding,7,NONE,ANKIB1,36.8283894215301,155410,ALL,MAYO -Diagnosis,,AD-CONTROL,ENSG00000001460,-0.0428271747433806,-0.134111094837718,0.0484567453509564,4.53063964529137,-0.922749954447199,0.356592428630043,0.451948815550479,protein_coding,1,NONE,STPG1,44.0903630539242,59936,ALL,MAYO -Diagnosis,TCX,,ENSG00000000419,-0.0106100593393075,-0.0731334456790523,0.0519133270004373,4.97936316930159,-0.335784216826404,0.737176767177833,0.798556808985985,protein_coding,20,NONE,DPM1,39.8497192789902,23689,ALL,MAYO -Diagnosis,IFG,AD-CONTROL,ENSG00000000419,-0.0878179824082075,-0.182373477784594,0.0067375129681794,4.32089559628026,-1.82192928105604,0.0688901841106443,0.155209255305467,,20,NONE,DPM1,39.8497192789902,23689,ALL,MSSM -Diagnosis,IFG,AD-CONTROL,ENSG00000001629,0.0804858875345706,0.0073127133165788,0.153659061752562,7.93343801331035,2.16036740054897,0.0310645962980247,0.0847168839428919,protein_coding,,NONE,ANKIB1,36.8283894215301,155410,ALL,MSSM -Diagnosis,IFG,AD-CONTROL,ENSG00000001460,-0.0342369800592941,-0.114230375069546,0.0457564149509574,3.8133273490137,-0.842377670501157,0.399870699901111,0.554357331277541,protein_coding,1,,STPG1,44.0903630539242,59936,,MSSM -Diagnosis,IFG,AD-CONTROL,ENSG00000001631,-0.0286960913417103,-0.0945307456366793,0.0371385629532588,4.11960673387962,-0.856820989104849,0.391728896583518,0.546803082969529,protein_coding,7,NONE,KRIT1,36.4761218695707,47198,ALL, -Diagnosis,DLPFC,AD-CONTROL,ENSG00000000419,-0.0882919008770653,-0.131304264580892,-0.0452795371732381,3.92713408775626,-4.02835272367389,5.78948411033998e-05,0.0006377563132996,protein_coding,20,NONE,DPM1,39.8497192789902,23689,ALL,ROSMAP -Diagnosis,DLPFC,AD-CONTROL,ENSG00000001629,0.0841212438799088,0.0354911663215749,0.132751321438243,6.65482116592828,3.39203333183632,0.0007048884218731,0.0045744978486257,protein_coding,7,NONE,ANKIB1,36.8283894215301,155410,ALL,ROSMAP -Diagnosis,DLPFC,AD-CONTROL,ENSG00000001460,-0.0398165705036053,-0.0915728594627508,0.0119397184555402,4.01629214749823,-1.50874762983754,0.131493224324285,0.252297356550177,protein_coding,1,NONE,STPG1,44.0903630539242,59936,ALL,ROSMAP -Diagnosis,DLPFC,AD-CONTROL,ENSG00000001631,-0.0281454867160997,-0.0700675860372825,0.013776612605083,2.93709783776462,-1.31934379342664,0.187178523092241,0.327003220657394,protein_coding,7,NONE,KRIT1,36.4761218695707,47198,ALL,ROSMAP diff --git a/tests/test_assets/gene_info/input/genes_biodomains_good_input.csv b/tests/test_assets/gene_info/input/genes_biodomains_good_input.csv index 86424daf..0c111a10 100644 --- a/tests/test_assets/gene_info/input/genes_biodomains_good_input.csv +++ b/tests/test_assets/gene_info/input/genes_biodomains_good_input.csv @@ -4,3 +4,4 @@ Proteostasis,Pr,Proteostasis [Pr],#c8b269,,endopeptidase activity,73.0,TNMD,ENSG Apoptosis,Ap,Apoptosis [Ap],,GO:0006915,apoptotic process,577.0,DPM1,ENSG00000000419 Structural Stabilization,,,#ff9a9a,GO:0030863,cortical cytoskeleton,21.0,SCYL3,ENSG00000000457 Synapse,Sy,Synapse [Sy],#329a33,GO:0034704,,,TNMD,ENSG00000000005 +,Sy,Synapse [Sy],#329a33,GO:0034704,,,TNMD,ENSG00000000005 diff --git a/tests/test_assets/gene_info/input/proteomics_srm_type_error.csv b/tests/test_assets/gene_info/input/proteomics_srm_type_error.csv index 7fe545de..4500680e 100644 --- a/tests/test_assets/gene_info/input/proteomics_srm_type_error.csv +++ b/tests/test_assets/gene_info/input/proteomics_srm_type_error.csv @@ -1,7 +1,3 @@ uniqid,genename,uniprotid,ensembl_gene_id,tissue,log2_fc,ci_upr,ci_lwr,pval,cor_pval CD2AP|Q9Y5K6,CD2AP,Q9Y5K6,ENSG00000001630,DLPFC,0.026321079163413537,0.10244645627426802,-0.049804297947440936,0.6959456373623885,string_value ,SNCA,P37840,ENSG00000001629,DLPFC,-0.02568871764608687,0.028374837069623488,-0.07975227236179723,0.5048364890502188,0.7196605269439289 -NDUFA7|O95182,,O95182,ENSG00000000419,DLPFC,-0.08256234002725381,-0.03426991654584031,-0.13085476350866732,0.00019022687174463648,0.0010779522732196067 -DIP2B|Q9P265,DIP2B,,ENSG00000001036,DLPFC,0.002289361997716892,0.07241425980972072,-0.06783553581428692,0.9967679822609551, -VSNL1|P62760,VSNL1,P62760,ENSG00000000971,,-0.013745470109882328,0.014686673450034729,-0.042177613669799385,0.4928746100735819,0.7178825842376084 -SYT11|Q9BT88,SYT11,Q9BT88,ENSG00000000005,DLPFC,0.029120588040412152,0.07403386661338425,-0.015792690532559942,0.2810529496558294,0.4955407270247518 diff --git a/tests/test_assets/gene_info/input/proteomics_tmt_type_error.csv b/tests/test_assets/gene_info/input/proteomics_tmt_type_error.csv index 4219f33d..125040ad 100644 --- a/tests/test_assets/gene_info/input/proteomics_tmt_type_error.csv +++ b/tests/test_assets/gene_info/input/proteomics_tmt_type_error.csv @@ -1,7 +1,3 @@ uniqid,genename,uniprotid,ensembl_gene_id,tissue,log2_fc,ci_upr,ci_lwr,pval,cor_pval CYP51A1|A0A0C4DFL7,CYP51A1,A0A0C4DFL7,ENSG00000001630,DLPFC,-0.096536825520855,-0.0249064834553162,-0.168167167586394,0.00861397335314594,string_value ANKIB1|Q9P2G1,ANKIB1,Q9P2G1,ENSG00000001629,,-0.0231875933838178,-0.00261466480061534,-0.0437605219670202,0.0274462753402693,0.124442505184433 -,DPM1,H0Y368,ENSG00000000419,DLPFC,-0.0318866351834386,-0.00176441716357351,-0.0620088532033036,0.038161929543103,0.153081771056205 -FUCA2|Q9BTY2,,Q9BTY2,ENSG00000001036,DLPFC,-0.0889169453741395,0.00602502277420743,-0.183858913522486,0.0661508543031291,0.213410933915364 -CFH|P08603,CFH,P08603,ENSG00000000971,DLPFC,,0.156029348948909,-0.00385979845880688,0.0619708516992043,0.204796476548682 -KRIT1|O00522,KRIT1,,ENSG00000001631,DLPFC,0.052873152684011,0.12115434837832,-0.0154080430102979,0.128072571527854,0.318778967948762 diff --git a/tests/test_assets/gene_info/input/proteomics_type_error.csv b/tests/test_assets/gene_info/input/proteomics_type_error.csv index ad7e56aa..7fbfab60 100644 --- a/tests/test_assets/gene_info/input/proteomics_type_error.csv +++ b/tests/test_assets/gene_info/input/proteomics_type_error.csv @@ -1,14 +1,3 @@ uniqid,genename,uniprotid,ensembl_gene_id,tissue,log2_fc,ci_upr,ci_lwr,pval,cor_pval DPM1|O60762,DPM1,O60762,ENSG00000000419,DLPFC,-0.000532480602341678,0.108873058799453,-0.109938020004136,0.999999279258724,string_value GCLC|P48506,GCLC,P48506,ENSG00000001084,DLPFC,0.172360117768928,0.241008334458834,0.103711901079023,3.16356041363264e-09,6.82372624103319e-08 -CFH|P08603,CFH,P08603,ENSG00000000971,DLPFC,,,,, -CYP51A1|Q16850,CYP51A1,Q16850,ENSG00000001630,DLPFC,,,,, -GCLC|P48506,GCLC,P48506,ENSG00000001084,MFG,0.0113505662359543,0.230040224838345,-0.207339092366436,0.991302618555794,0.999999988150877 -CFH|P08603,CFH,P08603,ENSG00000000971,,0.00509733585480276,0.799985494934676,-0.789790823225071,0.999866632256444,0.999999988150877 -CYP51A1|Q16850,CYP51A1,Q16850,ENSG00000001630,MFG,,,,, -GCLC|P48506,GCLC,P48506,ENSG00000001084,TCX,-0.0429270360844782,0.0424170350995262,-0.128271107268483,0.461818930212383,0.999999901144234 -CFH|P08603,CFH,P08603,ENSG00000000971,TCX,,,,, -CYP51A1|Q16850,CYP51A1,Q16850,ENSG00000001630,TCX,,,,, -GCLC|P48506,,P48506,ENSG00000001084,AntPFC,-0.0232034474563143,0.0767886612547822,-0.123195556167411,0.848120820467051,0.999999904091243 -CFH|P08603,CFH,P08603,ENSG00000000971,AntPFC,,,,, -,CYP51A1,Q16850,ENSG00000001630,AntPFC,0.264726640853272,0.566731977706236,-0.0372786959996912,0.0989464176618443,0.564688784959532 diff --git a/tests/test_assets/gene_info/input/tep_adi_info_type_error.csv b/tests/test_assets/gene_info/input/tep_adi_info_type_error_1.csv similarity index 55% rename from tests/test_assets/gene_info/input/tep_adi_info_type_error.csv rename to tests/test_assets/gene_info/input/tep_adi_info_type_error_1.csv index 4a4732a0..ec127faa 100644 --- a/tests/test_assets/gene_info/input/tep_adi_info_type_error.csv +++ b/tests/test_assets/gene_info/input/tep_adi_info_type_error_1.csv @@ -1,6 +1,3 @@ ensembl_gene_id,hgnc_symbol,is_adi,is_tep ENSG00000000005,,True,True ENSG00000000419,DPM1,,False -ENSG00000001497,LAS1L,True, -ENSG00000001084,GCLC,,True -ENSG00000183791,ABCD,, diff --git a/tests/test_assets/gene_info/input/tep_adi_info_type_error_2.csv b/tests/test_assets/gene_info/input/tep_adi_info_type_error_2.csv new file mode 100644 index 00000000..061eb48a --- /dev/null +++ b/tests/test_assets/gene_info/input/tep_adi_info_type_error_2.csv @@ -0,0 +1,3 @@ +ensembl_gene_id,hgnc_symbol,is_adi,is_tep +ENSG00000000005,TNMD,string_value,True +ENSG00000000419,DPM1,,False diff --git a/tests/test_assets/gene_info/input/tep_adi_info_type_error_3.csv b/tests/test_assets/gene_info/input/tep_adi_info_type_error_3.csv new file mode 100644 index 00000000..c4703236 --- /dev/null +++ b/tests/test_assets/gene_info/input/tep_adi_info_type_error_3.csv @@ -0,0 +1,3 @@ +ensembl_gene_id,hgnc_symbol,is_adi,is_tep +ENSG00000000005,TNMD,True,string_value +ENSG00000000419,DPM1,,False From 6a0b669e4fb57fb2ccd882ccb0145ce26d342fd7 Mon Sep 17 00:00:00 2001 From: Jaclyn Beck Date: Fri, 9 Feb 2024 16:16:12 -0800 Subject: [PATCH 07/13] Changed argument for nest_fields from string to list in genes_biodomains transform to conform to the function definition --- src/agoradatatools/etl/transform/genes_biodomains.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/agoradatatools/etl/transform/genes_biodomains.py b/src/agoradatatools/etl/transform/genes_biodomains.py index 9d278cda..c6361d3d 100644 --- a/src/agoradatatools/etl/transform/genes_biodomains.py +++ b/src/agoradatatools/etl/transform/genes_biodomains.py @@ -103,7 +103,7 @@ def transform_genes_biodomains(datasets: dict) -> pd.DataFrame: df=genes_biodomains, grouping="ensembl_gene_id", new_column="gene_biodomains", - drop_columns="ensembl_gene_id", + drop_columns=["ensembl_gene_id"], ) return genes_biodomains From defebc8a42693b0f4f5f3bd8f2e3fbb2b9a86399 Mon Sep 17 00:00:00 2001 From: Jaclyn Beck Date: Fri, 9 Feb 2024 16:18:34 -0800 Subject: [PATCH 08/13] gene_info transform edits: fixed case where ensembl_info is null instead of a dict, biodomains now drops rows with NaN biodomain names, added check for boolean values in tep_info. Updated test to work with these fixes --- src/agoradatatools/etl/transform/gene_info.py | 43 +++++++++++++++---- tests/transform/test_gene_info.py | 29 +++++++++++-- 2 files changed, 59 insertions(+), 13 deletions(-) diff --git a/src/agoradatatools/etl/transform/gene_info.py b/src/agoradatatools/etl/transform/gene_info.py index 86d1504a..644ffde3 100644 --- a/src/agoradatatools/etl/transform/gene_info.py +++ b/src/agoradatatools/etl/transform/gene_info.py @@ -81,6 +81,7 @@ def transform_gene_info( drop_columns=["ensembl_gene_id"], ) + biodomains = biodomains.dropna(subset=["biodomain", "ensembl_gene_id"]) biodomains = ( biodomains.groupby("ensembl_gene_id")["biodomain"] .apply(set) # ensure unique biodomain names @@ -92,6 +93,15 @@ def transform_gene_info( # sort biodomains list alphabetically biodomains["biodomains"] = biodomains["biodomains"].apply(sorted) + # Type-check the 'is_adi' and 'is_tep' columns of tep_info to make sure they are booleans and not strings. + # Explicitly make NaN is_adi and is_tep values "False" to avoid having to check for boolean and NaN in the + # check below. + tep_info = tep_info.fillna({"is_adi": False, "is_tep": False}) + if tep_info["is_adi"].dtype != bool: + raise TypeError("wrong data type in 'is_adi' column") + if tep_info["is_tep"].dtype != bool: + raise TypeError("wrong data type in 'is_tep' column") + # For genes with either is_adi or is_tep set to True, create a resource URL that opens # the portal page to the specific gene. This must be done using the hgnc_symbol from the # tep_info file and not the symbol in gene_info, because there are some mismatches @@ -99,10 +109,13 @@ def transform_gene_info( # resource_url should be NA if both is_adi and is_tep are false. resource_url_prefix = "https://adknowledgeportal.synapse.org/Explore/Target%20Enabling%20Resources?QueryWrapper0=%7B%22sql%22%3A%22select%20*%20from%20syn26146692%20WHERE%20%60isPublic%60%20%3D%20true%22%2C%22limit%22%3A25%2C%22offset%22%3A0%2C%22selectedFacets%22%3A%5B%7B%22concreteType%22%3A%22org.sagebionetworks.repo.model.table.FacetColumnValuesRequest%22%2C%22columnName%22%3A%22target%22%2C%22facetValues%22%3A%5B%22" resource_url_suffix = "%22%5D%7D%5D%7D" + tep_info["resource_url"] = tep_info.apply( - lambda row: resource_url_prefix + row["hgnc_symbol"] + resource_url_suffix - if row["is_adi"]==True or row["is_tep"]==True - else np.NaN, + lambda row: ( + resource_url_prefix + row["hgnc_symbol"] + resource_url_suffix + if row["is_adi"] == True or row["is_tep"] == True + else np.NaN + ), axis=1, ) @@ -161,12 +174,22 @@ def transform_gene_info( # fillna doesn't work for creating an empty array, need this function instead gene_info["alias"] = gene_info.apply( - lambda row: row["alias"] - if isinstance(row["alias"], np.ndarray) - else np.ndarray(0, dtype=object), + lambda row: ( + row["alias"] + if isinstance(row["alias"], np.ndarray) + else np.ndarray(0, dtype=object) + ), axis=1, ) + # fillna doesn't work for creating dictionaries, need this function instead + if gene_info["ensembl_info"].hasnans: + gene_info.loc[gene_info["ensembl_info"].isnull(), "ensembl_info"] = { + "ensembl_release": np.NaN, + "ensembl_possible_replacements": [], + "ensembl_permalink": np.NaN, + } + gene_info["rna_brain_change_studied"] = gene_info["adj_p_val"] != -1 gene_info["is_any_rna_changed_in_ad_brain"] = ( gene_info["adj_p_val"] <= adjusted_p_value_threshold @@ -179,9 +202,11 @@ def transform_gene_info( # create 'total_nominations' field gene_info["total_nominations"] = gene_info.apply( - lambda row: len(row["target_nominations"]) - if isinstance(row["target_nominations"], list) - else np.NaN, + lambda row: ( + len(row["target_nominations"]) + if isinstance(row["target_nominations"], list) + else np.NaN + ), axis=1, ) diff --git a/tests/transform/test_gene_info.py b/tests/transform/test_gene_info.py index 2e3642f0..25090252 100644 --- a/tests/transform/test_gene_info.py +++ b/tests/transform/test_gene_info.py @@ -55,30 +55,49 @@ class TestTransformGeneInfo: {"diff_exp_data": "diff_exp_data_type_error.csv"}, param_set_1, TypeError, + "'<=' not supported", ), ( # Bad data type in proteomics core_files, {"proteomics": "proteomics_type_error.csv"}, param_set_1, TypeError, + "'<=' not supported", ), ( # Bad data type in proteomics_tmt core_files, {"proteomics_tmt": "proteomics_tmt_type_error.csv"}, param_set_1, TypeError, + "'<=' not supported", ), ( # Bad data type in proteomics_srm core_files, {"proteomics_srm": "proteomics_srm_type_error.csv"}, param_set_1, TypeError, + "'<=' not supported", ), ( # Missing HGNC in tep_adi_info core_files, - {"tep_adi_info": "tep_adi_info_type_error.csv"}, + {"tep_adi_info": "tep_adi_info_type_error_1.csv"}, param_set_1, TypeError, + "can only concatenate str", + ), + ( # is_adi is a string + core_files, + {"tep_adi_info": "tep_adi_info_type_error_2.csv"}, + param_set_1, + TypeError, + "wrong data type in 'is_adi' column", + ), + ( # is_tep is a string + core_files, + {"tep_adi_info": "tep_adi_info_type_error_3.csv"}, + param_set_1, + TypeError, + "wrong data type in 'is_tep' column", ), ] fail_test_ids = [ @@ -87,6 +106,8 @@ class TestTransformGeneInfo: "Fail with bad data type in proteomics_tmt's cor_pval column", "Fail with bad data type in proteomics_srm's cor_pval column", "Fail with missing hgnc_symbol in tep_adi_info", + "Fail with bad data type in tep_adi_info's is_adi column", + "Fail with bad data type in tep_adi_info's is_tep column", ] def read_input_files_dict(self, input_files_dict): @@ -132,18 +153,18 @@ def test_transform_gene_info_should_pass( pd.testing.assert_frame_equal(output_df, expected_df) @pytest.mark.parametrize( - "input_files_dict, failure_case_files_dict, param_set, error_type", + "input_files_dict, failure_case_files_dict, param_set, error_type, error_match_string", fail_test_data, ids=fail_test_ids, ) def test_transform_gene_info_should_fail( - self, input_files_dict, failure_case_files_dict, param_set, error_type + self, input_files_dict, failure_case_files_dict, param_set, error_type, error_match_string ): # Any files specified in 'failure_case_files_dict' will replace their default "good" files in input_files_dict for key, value in failure_case_files_dict.items(): input_files_dict[key] = value - with pytest.raises(error_type): + with pytest.raises(error_type, match=error_match_string): datasets = self.read_input_files_dict(input_files_dict) gene_info.transform_gene_info( From 3cda173b4fdd464254eaef7d0779c6b82f97076f Mon Sep 17 00:00:00 2001 From: Jaclyn Beck Date: Fri, 9 Feb 2024 16:40:54 -0800 Subject: [PATCH 09/13] Fixed some pre-commit and SonarCloud issues --- src/agoradatatools/etl/transform/gene_info.py | 2 +- tests/transform/test_gene_info.py | 27 +++++++++++++------ 2 files changed, 20 insertions(+), 9 deletions(-) diff --git a/src/agoradatatools/etl/transform/gene_info.py b/src/agoradatatools/etl/transform/gene_info.py index 644ffde3..2ac1c074 100644 --- a/src/agoradatatools/etl/transform/gene_info.py +++ b/src/agoradatatools/etl/transform/gene_info.py @@ -113,7 +113,7 @@ def transform_gene_info( tep_info["resource_url"] = tep_info.apply( lambda row: ( resource_url_prefix + row["hgnc_symbol"] + resource_url_suffix - if row["is_adi"] == True or row["is_tep"] == True + if row["is_adi"] is True or row["is_tep"] is True else np.NaN ), axis=1, diff --git a/tests/transform/test_gene_info.py b/tests/transform/test_gene_info.py index 25090252..96289429 100644 --- a/tests/transform/test_gene_info.py +++ b/tests/transform/test_gene_info.py @@ -1,4 +1,5 @@ -import json +"""Integration test for the gene_info transform.""" + import os import pandas as pd @@ -8,6 +9,10 @@ class TestTransformGeneInfo: + """Tests the gene_info transform. This transform requires 12 different data files, so this test class contains a + util function to read them all in, formatted as transform_gene_info expects. + """ + data_files_path = "tests/test_assets/gene_info" param_set_1 = { "adjusted_p_value_threshold": 0.05, @@ -32,6 +37,7 @@ class TestTransformGeneInfo: "genes_biodomains": "genes_biodomains_good_input.csv", "tep_adi_info": "tep_adi_info_good_input.csv", } + pval_error_match_string = "'<=' not supported" pass_test_data = [ ( # Pass with good data on param set 1 @@ -55,28 +61,28 @@ class TestTransformGeneInfo: {"diff_exp_data": "diff_exp_data_type_error.csv"}, param_set_1, TypeError, - "'<=' not supported", + pval_error_match_string, ), ( # Bad data type in proteomics core_files, {"proteomics": "proteomics_type_error.csv"}, param_set_1, TypeError, - "'<=' not supported", + pval_error_match_string, ), ( # Bad data type in proteomics_tmt core_files, {"proteomics_tmt": "proteomics_tmt_type_error.csv"}, param_set_1, TypeError, - "'<=' not supported", + pval_error_match_string, ), ( # Bad data type in proteomics_srm core_files, {"proteomics_srm": "proteomics_srm_type_error.csv"}, param_set_1, TypeError, - "'<=' not supported", + pval_error_match_string, ), ( # Missing HGNC in tep_adi_info core_files, @@ -110,7 +116,7 @@ class TestTransformGeneInfo: "Fail with bad data type in tep_adi_info's is_tep column", ] - def read_input_files_dict(self, input_files_dict): + def read_input_files_dict(self, input_files_dict: dict) -> dict: """Utility function to read a dictionary of filenames into a dictionary of data frames. Most files for gene_info are in csv format, but the 'gene_metadata' file is in feather format and needs special casing. @@ -138,7 +144,7 @@ def read_input_files_dict(self, input_files_dict): ids=pass_test_ids, ) def test_transform_gene_info_should_pass( - self, input_files_dict, expected_output_file, param_set + self, input_files_dict: dict, expected_output_file: str, param_set: dict ): datasets = self.read_input_files_dict(input_files_dict) @@ -158,7 +164,12 @@ def test_transform_gene_info_should_pass( ids=fail_test_ids, ) def test_transform_gene_info_should_fail( - self, input_files_dict, failure_case_files_dict, param_set, error_type, error_match_string + self, + input_files_dict: dict, + failure_case_files_dict: dict, + param_set: dict, + error_type: BaseException, + error_match_string: str, ): # Any files specified in 'failure_case_files_dict' will replace their default "good" files in input_files_dict for key, value in failure_case_files_dict.items(): From 31a1b1c7befaf0399ba9030a8b8684f2843c8a5b Mon Sep 17 00:00:00 2001 From: Jaclyn Beck Date: Mon, 12 Feb 2024 15:35:49 -0800 Subject: [PATCH 10/13] Simplified gene_info test asset files for easier maintainability, added tests for duplicate Ensembl IDs, fixed issue with null ensembl_info in the gene_info transform --- src/agoradatatools/etl/transform/gene_info.py | 66 ++--- .../input/diff_exp_data_good_input.csv | 30 +- .../input/diff_exp_data_type_error.csv | 4 +- .../input/druggability_good_input.csv | 10 +- .../gene_info/input/eqtl_good_input.csv | 3 +- .../gene_info/input/eqtl_merge_error.csv | 4 + .../input/gene_metadata_good_input.feather | Bin 13202 -> 8474 bytes .../input/gene_metadata_merge_error.feather | Bin 0 -> 6362 bytes .../gene_info/input/igap_good_input.csv | 1 + .../gene_info/input/igap_merge_error.csv | 4 + .../input/median_expression_good_input.csv | 20 +- .../gene_info/input/proteomics_good_input.csv | 20 +- .../input/proteomics_srm_good_input.csv | 12 +- .../input/proteomics_srm_type_error.csv | 4 +- .../input/proteomics_tmt_good_input.csv | 12 +- .../input/proteomics_tmt_type_error.csv | 4 +- .../gene_info/input/proteomics_type_error.csv | 4 +- .../input/target_list_good_input.csv | 10 +- .../output/gene_info_good_output_1.json | 270 ++++++++++-------- .../output/gene_info_good_output_2.json | 270 ++++++++++-------- tests/transform/test_gene_info.py | 124 +++++++- 21 files changed, 527 insertions(+), 345 deletions(-) create mode 100644 tests/test_assets/gene_info/input/eqtl_merge_error.csv create mode 100644 tests/test_assets/gene_info/input/gene_metadata_merge_error.feather create mode 100644 tests/test_assets/gene_info/input/igap_merge_error.csv diff --git a/src/agoradatatools/etl/transform/gene_info.py b/src/agoradatatools/etl/transform/gene_info.py index 2ac1c074..ef84bcc3 100644 --- a/src/agoradatatools/etl/transform/gene_info.py +++ b/src/agoradatatools/etl/transform/gene_info.py @@ -119,22 +119,6 @@ def transform_gene_info( axis=1, ) - ensembl_info = gene_metadata[ - [ - "ensembl_gene_id", - "ensembl_release", - "ensembl_possible_replacements", - "ensembl_permalink", - ] - ] - ensembl_info = nest_fields( - df=ensembl_info, - grouping="ensembl_gene_id", - new_column="ensembl_info", - drop_columns=["ensembl_gene_id"], - nested_field_is_list=False, - ) - # Merge all the datasets gene_info = gene_metadata @@ -148,7 +132,6 @@ def transform_gene_info( druggability, biodomains, tep_info, - ensembl_info, ]: gene_info = pd.merge( left=gene_info, @@ -172,23 +155,42 @@ def transform_gene_info( inplace=True, ) - # fillna doesn't work for creating an empty array, need this function instead - gene_info["alias"] = gene_info.apply( - lambda row: ( - row["alias"] - if isinstance(row["alias"], np.ndarray) - else np.ndarray(0, dtype=object) - ), - axis=1, + # fillna doesn't work for creating an empty array, need this function instead for alias and possible replacements + gene_info["alias"] = gene_info["alias"].apply( + lambda row: row if isinstance(row, np.ndarray) else np.ndarray(0, dtype=object) ) - # fillna doesn't work for creating dictionaries, need this function instead - if gene_info["ensembl_info"].hasnans: - gene_info.loc[gene_info["ensembl_info"].isnull(), "ensembl_info"] = { - "ensembl_release": np.NaN, - "ensembl_possible_replacements": [], - "ensembl_permalink": np.NaN, - } + gene_info["ensembl_possible_replacements"] = gene_info[ + "ensembl_possible_replacements" + ].apply( + lambda row: row if isinstance(row, np.ndarray) else np.ndarray(0, dtype=object) + ) + + # Add ensembl_info as a nested field. This is done after merging all other data sets so it applies to + # all possible Ensembl IDs in all data sets. + ensembl_info = gene_info[ + [ + "ensembl_gene_id", + "ensembl_release", + "ensembl_possible_replacements", + "ensembl_permalink", + ] + ] + ensembl_info = nest_fields( + df=ensembl_info, + grouping="ensembl_gene_id", + new_column="ensembl_info", + drop_columns=["ensembl_gene_id"], + nested_field_is_list=False, + ) + + gene_info = pd.merge( + left=gene_info, + right=ensembl_info, + on="ensembl_gene_id", + how="outer", + validate="one_to_one", + ) gene_info["rna_brain_change_studied"] = gene_info["adj_p_val"] != -1 gene_info["is_any_rna_changed_in_ad_brain"] = ( diff --git a/tests/test_assets/gene_info/input/diff_exp_data_good_input.csv b/tests/test_assets/gene_info/input/diff_exp_data_good_input.csv index b4bcdbf9..74ef3dfa 100644 --- a/tests/test_assets/gene_info/input/diff_exp_data_good_input.csv +++ b/tests/test_assets/gene_info/input/diff_exp_data_good_input.csv @@ -1,16 +1,16 @@ model,tissue,comparison,ensembl_gene_id,logfc,ci_l,ci_r,aveexpr,t,p_value,adj_p_val,gene_biotype,chromosome_name,direction,hgnc_symbol,percentage_gc_content,gene_length,sex,study -Diagnosis,TCX,AD-CONTROL,ENSG00000001626,-0.38386144170829,-0.610418505477203,-0.157304377939378,1.0964718993346,-3.32039574535246,0.0009658465939259,0.0028700749497231,protein_coding,7,DOWN,CFTR,36.5864869618047,250188,ALL,MAYO -Diagnosis,TCX,AD-CONTROL,ENSG00000001631,0.0960797874215311,0.0277369852074512,0.164422589635611,3.66470161931573,2.79371699517244,0.0054146957075407,0.0128646180744981,protein_coding,7,NONE,KRIT1,36.4761218695707,47198,ALL,MAYO -,TCX,AD-CONTROL,ENSG00000001629,0.0688865504482834,-0.0336046266228221,0.171377727519389,6.83925693988808,1.3190719562984,0.187765592003833,0.266881156254369,protein_coding,7,NONE,ANKIB1,36.8283894215301,155410,ALL,MAYO -Diagnosis,,AD-CONTROL,ENSG00000001460,-0.0428271747433806,-0.134111094837718,0.0484567453509564,4.53063964529137,-0.922749954447199,0.356592428630043,0.451948815550479,protein_coding,1,NONE,STPG1,44.0903630539242,59936,ALL,MAYO -Diagnosis,TCX,,ENSG00000000419,-0.0106100593393075,-0.0731334456790523,0.0519133270004373,4.97936316930159,-0.335784216826404,0.737176767177833,0.798556808985985,protein_coding,20,NONE,DPM1,39.8497192789902,23689,ALL,MAYO -Diagnosis,IFG,AD-CONTROL,ENSG00000000419,-0.0878179824082075,-0.182373477784594,0.0067375129681794,4.32089559628026,-1.82192928105604,0.0688901841106443,0.155209255305467,,20,NONE,DPM1,39.8497192789902,23689,ALL,MSSM -Diagnosis,IFG,AD-CONTROL,ENSG00000001629,0.0804858875345706,0.0073127133165788,0.153659061752562,7.93343801331035,2.16036740054897,0.0310645962980247,0.0847168839428919,protein_coding,,NONE,ANKIB1,36.8283894215301,155410,ALL,MSSM -Diagnosis,IFG,AD-CONTROL,ENSG00000001460,-0.0342369800592941,-0.114230375069546,0.0457564149509574,3.8133273490137,-0.842377670501157,0.399870699901111,0.554357331277541,protein_coding,1,,STPG1,44.0903630539242,59936,,MSSM -Diagnosis,IFG,AD-CONTROL,ENSG00000001631,-0.0286960913417103,-0.0945307456366793,0.0371385629532588,4.11960673387962,-0.856820989104849,0.391728896583518,0.546803082969529,protein_coding,7,NONE,KRIT1,36.4761218695707,47198,ALL, -Diagnosis,IFG,AD-CONTROL,ENSG00000001626,0.0258715927543007,-0.1276221394235,0.179365324932101,3.0102152746566,0.330569495815668,0.741062726962072,0.834121516755035,protein_coding,7,NONE,,36.5864869618047,250188,ALL,MSSM -Diagnosis,DLPFC,AD-CONTROL,ENSG00000001626,-0.14255892208891,-0.34294790004963,0.0578300558718095,-0.301663671232398,-1.39443750667908,0.163313064777812,0.296117785659528,protein_coding,7,NONE,CFTR,36.5864869618047,250188,ALL,ROSMAP -Diagnosis,DLPFC,AD-CONTROL,ENSG00000000419,-0.0882919008770653,-0.131304264580892,-0.0452795371732381,3.92713408775626,-4.02835272367389,5.78948411033998e-05,0.0006377563132996,protein_coding,20,NONE,DPM1,39.8497192789902,23689,ALL,ROSMAP -Diagnosis,DLPFC,AD-CONTROL,ENSG00000001629,0.0841212438799088,0.0354911663215749,0.132751321438243,6.65482116592828,3.39203333183632,0.0007048884218731,0.0045744978486257,protein_coding,7,NONE,ANKIB1,36.8283894215301,155410,ALL,ROSMAP -Diagnosis,DLPFC,AD-CONTROL,ENSG00000001460,-0.0398165705036053,-0.0915728594627508,0.0119397184555402,4.01629214749823,-1.50874762983754,0.131493224324285,0.252297356550177,protein_coding,1,NONE,STPG1,44.0903630539242,59936,ALL,ROSMAP -Diagnosis,DLPFC,AD-CONTROL,ENSG00000001631,-0.0281454867160997,-0.0700675860372825,0.013776612605083,2.93709783776462,-1.31934379342664,0.187178523092241,0.327003220657394,protein_coding,7,NONE,KRIT1,36.4761218695707,47198,ALL,ROSMAP +Diagnosis,TCX,AD-CONTROL,ENSG00000001626,-0.384,-0.61,-0.157,1.1,-3.32,0.001,0.003,protein_coding,7.0,DOWN,CFTR,36.59,250188,ALL,MAYO +Diagnosis,TCX,AD-CONTROL,ENSG00000001631,0.096,0.028,0.164,3.66,2.79,0.005,0.013,protein_coding,7.0,NONE,KRIT1,36.48,47198,ALL,MAYO +,TCX,AD-CONTROL,ENSG00000001629,0.069,-0.034,0.171,6.84,1.32,0.188,0.267,protein_coding,7.0,NONE,ANKIB1,36.83,155410,ALL,MAYO +Diagnosis,,AD-CONTROL,ENSG00000001460,-0.043,-0.134,0.048,4.53,-0.92,0.357,0.452,protein_coding,1.0,NONE,,44.09,59936,ALL,MAYO +Diagnosis,TCX,,ENSG00000000419,-0.011,-0.073,0.052,4.98,-0.34,0.737,0.799,protein_coding,20.0,NONE,DPM1,39.85,23689,ALL,MAYO +Diagnosis,IFG,AD-CONTROL,ENSG00000000419,-0.088,-0.182,0.007,4.32,-1.82,0.069,0.155,,20.0,NONE,DPM1,39.85,23689,ALL,MSSM +Diagnosis,IFG,,ENSG00000001629,0.08,0.007,0.154,7.93,2.16,0.031,0.085,protein_coding,,NONE,ANKIB1,36.83,155410,ALL,MSSM +Diagnosis,IFG,AD-CONTROL,ENSG00000001460,-0.034,-0.114,0.046,3.81,-0.84,0.4,0.554,protein_coding,1.0,,STPG1,44.09,59936,,MSSM +Diagnosis,IFG,AD-CONTROL,ENSG00000001631,-0.029,-0.095,0.037,4.12,-0.86,0.392,0.547,protein_coding,7.0,NONE,KRIT1,36.48,47198,ALL, +Diagnosis,IFG,AD-CONTROL,ENSG00000001626,0.026,-0.128,0.179,3.01,0.33,0.741,0.834,protein_coding,7.0,NONE,,36.59,250188,ALL,MSSM +Diagnosis,DLPFC,AD-CONTROL,ENSG00000001626,-0.143,-0.343,0.058,-0.3,-1.39,0.163,0.296,protein_coding,7.0,NONE,CFTR,36.59,250188,ALL, +Diagnosis,DLPFC,AD-CONTROL,ENSG00000000419,-0.088,-0.131,-0.045,3.93,-4.03,0.0,0.001,protein_coding,20.0,NONE,DPM1,39.85,23689,ALL,ROSMAP +Diagnosis,DLPFC,AD-CONTROL,ENSG00000001629,0.084,0.035,0.133,6.65,3.39,0.001,0.005,protein_coding,7.0,NONE,ANKIB1,36.83,155410,ALL,ROSMAP +Diagnosis,DLPFC,AD-CONTROL,ENSG00000001460,-0.04,-0.092,0.012,4.02,-1.51,0.131,0.252,protein_coding,1.0,NONE,STPG1,44.09,59936,ALL,ROSMAP +Diagnosis,DLPFC,AD-CONTROL,ENSG00000001631,-0.028,-0.07,0.014,2.94,-1.32,0.187,0.327,protein_coding,7.0,NONE,KRIT1,36.48,47198,ALL,ROSMAP diff --git a/tests/test_assets/gene_info/input/diff_exp_data_type_error.csv b/tests/test_assets/gene_info/input/diff_exp_data_type_error.csv index f2c78adb..3a809fab 100644 --- a/tests/test_assets/gene_info/input/diff_exp_data_type_error.csv +++ b/tests/test_assets/gene_info/input/diff_exp_data_type_error.csv @@ -1,3 +1,3 @@ model,tissue,comparison,ensembl_gene_id,logfc,ci_l,ci_r,aveexpr,t,p_value,adj_p_val,gene_biotype,chromosome_name,direction,hgnc_symbol,percentage_gc_content,gene_length,sex,study -Diagnosis,TCX,AD-CONTROL,ENSG00000001626,-0.38386144170829,-0.610418505477203,-0.157304377939378,1.0964718993346,-3.32039574535246,0.0009658465939259,string_value,protein_coding,7,DOWN,CFTR,36.5864869618047,250188,ALL,MAYO -Diagnosis,TCX,AD-CONTROL,ENSG00000001631,0.0960797874215311,0.0277369852074512,0.164422589635611,3.66470161931573,2.79371699517244,0.0054146957075407,0.0128646180744981,protein_coding,7,NONE,KRIT1,36.4761218695707,47198,ALL,MAYO +Diagnosis,TCX,AD-CONTROL,ENSG00000001626,-0.384,-0.61,-0.157,1.1,-3.32,0.001,string_value,protein_coding,7.0,DOWN,CFTR,36.59,250188,ALL,MAYO +Diagnosis,TCX,AD-CONTROL,ENSG00000001631,0.096,0.028,0.164,3.66,2.79,0.005,0.013,protein_coding,7.0,NONE,KRIT1,36.48,47198,ALL,MAYO diff --git a/tests/test_assets/gene_info/input/druggability_good_input.csv b/tests/test_assets/gene_info/input/druggability_good_input.csv index 5c9269d7..28fe765e 100644 --- a/tests/test_assets/gene_info/input/druggability_good_input.csv +++ b/tests/test_assets/gene_info/input/druggability_good_input.csv @@ -1,6 +1,6 @@ ensembl_gene_id,sm_druggability_bucket,safety_bucket,feasibility_bucket,abability_bucket,new_modality_bucket,tissue_engagement_bucket,pharos_class,classification,safety_bucket_definition,feasibility_bucket_definition,abability_bucket_definition,new_modality_bucket_definition,tissue_engagement_bucket_definition -ENSG00000000005,7,4,4,3,4,4,Tbio,,"More than two of: high off target gene expression, cancer driver, essential gene, associated deleterious genetic disorder, HPO phenotype associated gene, or black box warning on clinically used drug.","Medium target qualification, no critical safety issues. Identify and perform steps needed for additional TQ and move to category 1, 2 or 3 if successful, or park in category 5 or 6 if not.","Cell membrane-bound proteins. Highly accessible to antibody-based therapies, but potentially less so than secreted proteins or ECM components.",Not specified suitable for degradation/inhibition by user.,The target gene is enriched/enhanced in tissues marked by the user as being associated with a low risk of off-target engagement issues. -ENSG00000001036,1,3,5,1,4,2,Tchem,"Small molecule druggable: Protein with a small molecule ligand identified from ChEMBL, meeting TCRD activity criteria.",,Medium or high target qualification. Interesting target but has gaps in TQ or tractability/druggability which will make progress difficult or slow. Park and watch for developments e.g. until additional TQ or tools emerge. ,Secreted protein. Highly accessible to antibody-based therapies.,Not specified suitable for degradation/inhibition by user.,"The target gene is not ""tissue enriched""/""tissue enhanced"" in any tissue." -ENSG00000000460,13,3,5,3,4,4,Tdark,Unknown: There is no information on ligands or structure in any of the categories above. ,"Two or fewer of: high off-target gene expression, cancer driver, essential gene, associated deleterious genetic disorder, HPO phenotype associated gene, or black box warning on clinically used drug.",,,Not specified suitable for degradation/inhibition by user.,The target gene is enriched/enhanced in tissues marked by the user as being associated with a low risk of off-target engagement issues. -ENSG00000000971,3,4,3,1,4,5,Tbio,"Targetable by structure: Structurally druggable protein, based on the presence of a druggable pocket in the protein (DrugEBIlity/CanSAR).","More than two of: high off target gene expression, cancer driver, essential gene, associated deleterious genetic disorder, HPO phenotype associated gene, or black box warning on clinically used drug.","High target qualification but low feasibility – Tier C for drug development. Attempt to address gaps and re-evaluate if additional feasibility tools found. Either move to category 1 or 2 if successful, or park in category 5 if not.",Secreted protein. Highly accessible to antibody-based therapies.,,The target gene is enriched/enhanced in tissues marked by the user as being associated with a high risk of off-target engagement issues. -ENSG00000001084,1,5,3,3,4,2,,"Small molecule druggable: Protein with a small molecule ligand identified from ChEMBL, meeting TCRD activity criteria.",Clinical data with evidence of intolerable safety profile/adverse drug reactions in the desired modality and with target engagement. Drug for target withdrawn on those grounds.,"High target qualification but low feasibility – Tier C for drug development. Attempt to address gaps and re-evaluate if additional feasibility tools found. Either move to category 1 or 2 if successful, or park in category 5 if not.","Cell membrane-bound proteins. Highly accessible to antibody-based therapies, but potentially less so than secreted proteins or ECM components.",Not specified suitable for degradation/inhibition by user., +ENSG00000000005,7,4,4,3,4,4,Tbio,,Safety definition 1,Feasibility definition 1,Abability definition 1,Modaility definition 1,Tissue engagement definition 1 +ENSG00000001036,1,3,5,1,4,2,Tchem,Classification 2,,Feasibility definition 2,Abability definition 2,Modaility definition 2,Tissue engagement definition 2 +ENSG00000000460,13,3,5,3,4,4,Tdark,Classification 3,Safety definition 3,,,Modaility definition 3,Tissue engagement definition 3 +ENSG00000000971,3,4,3,1,4,5,Tbio,Classification 4,Safety definition 4,Feasibility definition 4,Abability definition 4,,Tissue engagement definition 4 +ENSG00000001084,1,5,3,3,4,2,,Classification 5,Safety definition 5,Feasibility definition 5,Abability definition 5,Modaility definition 5, diff --git a/tests/test_assets/gene_info/input/eqtl_good_input.csv b/tests/test_assets/gene_info/input/eqtl_good_input.csv index 9e49ca8f..a8b17520 100644 --- a/tests/test_assets/gene_info/input/eqtl_good_input.csv +++ b/tests/test_assets/gene_info/input/eqtl_good_input.csv @@ -3,4 +3,5 @@ ENSG00000000419,True ENSG00000000971,True ENSG00000001460,True ENSG00000001626,True -ENSG00000161149,False +ENSG00000161149, +ENSG00000001517,False diff --git a/tests/test_assets/gene_info/input/eqtl_merge_error.csv b/tests/test_assets/gene_info/input/eqtl_merge_error.csv new file mode 100644 index 00000000..38984052 --- /dev/null +++ b/tests/test_assets/gene_info/input/eqtl_merge_error.csv @@ -0,0 +1,4 @@ +ensembl_gene_id,is_eqtl +ENSG00000000419,True +ENSG00000000971,True +ENSG00000000419,True diff --git a/tests/test_assets/gene_info/input/gene_metadata_good_input.feather b/tests/test_assets/gene_info/input/gene_metadata_good_input.feather index b7f8a1ce519148fb0e3ee8a406d64e992b7560d8..291717bf5e2c53268880ffcd3289a93fafdfe9d7 100644 GIT binary patch delta 2595 zcmd^>-)~c87{{NMcI(!Sc61ZnkiAw$Hfcx8#(E$GHbG!ASlxh)grqyPhxI_u4^L0U zdSQohWr*24UKryAa-#IZeGd-6ks-vu}bnf;iT2>DBwgp;<0z~x3iu^<@p3A9vL3uS- zQeII?xl}F}zx{bj&^fd_yt3B4<g|*;f^Ep?20hO0P^|V0EU)wm zgNfP@^|Gk1i<)8FymVJ-gj`Ep6@g7JCeaUsx z4WV}n{jJbKlBe^+AqBmNTYLIX(LYIixloRX5ihm6+8e+Pp)a~_3W@113T+DQa+LVNv_=NJ+R^Ik%e&Ir9M;)jJ^6WSZVL!osMFY-Oei+I6e*p;$d5qS>N zbD-`3Jwl7;Hncb6Uc6b06~!*|h5P3pm(|>=*SVN(Pj2+uobN2f`kt=WBO8e{FU&yv zZ8=ll@0yM}u9@VPWAsh1UImX%7u zxUA|j+(##motjEB+$oAB=WSJAWY$)?zj$nhnU-N2Nkh*Yi;Au!7_#M(S(KAg$%~b| zQC4y1C~Obhi=~P!YqG5*^VPDgsJg;RaO;(PNs%pP*DP5t=PlK=RRjLAsaP7bWwpez z$C8$sFER`6Q*2T|m#unS(Sg>zAuLXuZ`^OhvXCE=)FwynNL* z@x5;_omoISySOYF*A8O%|7<5PKGX2e)YZ`&? zc$6Q+Euas3YUd6ZNx&O81df9la31ixg`d9Hz%}qT;QQ|;@8~8Bw|JJ=Tra``-6NP+>f;vq>;?Nk0~-onJhQ)IKrw0&~O6k9o z`8%=!p6f(L z=t#)sO%pN4)od5hNb%ju;Waw{uJd>Gl5NPCukie0?Ru%+cUCiF=w|Btcc$gEY)W&^ zcMCb*UQ<46B(2nZ7xI~0;c$jEeSw~IdDk+!tEDtoL-C87^el=S*);!l)8t^dr|FUf zoq1<3P}tEk)8_eUJ*%H@qdSv)t%_XG)-~6%i_FP1rfp6k;7z%L zX4}?I-jyA??HE>;r>dQ8k+x_PSVVd6FU)LB0otC(^0V{b1nLm)1j_QH74OFbhrfHb z{MfuW> zqyLR^t@GuIV>k|Z)I67%IvCW7l@!+?9kJ`pW<2 z1FAaY6A87=hX6IAR<2nDiZzoc5udZ`R+I|h5kN`^HI&OVg7$bBUPrkeUb@`Qf=_w! zJHqx-ELVE|bvYOG%fr(72Pvia1*Q0&QhYhu=HVWUfroPW{iS%hd~a#M>G@)GdnsNn z|4*fOxj#LeFc8}3A2eHvm!gN`<@)v?m0ym(S^8iQokFy5{6K2zo!&0_xJWv zysn8W@`xiZE*I3euWRbxn|3x>g6NxjTaRnvR|zty--HX;_VP zbR)T^6F+MAxwKS0m)5AA-Dgru=bA7r?OIMP-75cy?!wp6pHh$7-SH50GS#XWyM0|^?n8tQp$vJvHWx?X% zMLPzEhsH)!#mKt&HtE8YP;5P&H@Og3tQka-R%RN7UC3+&#)^_LT+1<>!J%;_XIrin zwqS#BZriF%*hytZ%NS-saSJv|Wp!n~A`!s!Q1|3ANwwz2g$_yP`=H#H_D6VFhN@6H% zV`|Xd9P|$*X=Ra|<)cx@h7++6n@w36q#RG<9V4*`^?2H68`G9!WpriQ_%(H4L^~-4 z6U(}mZn0b=g^9utu3>$cruG0$!_3kh_ZoJPb;63$umZM`#r(wwe@weGu&b~D`fmC} zDxY+vb?XOUOqO&V<1yj1zY3xildAGz+IwUe^K?-r%?0#&=*@B>io6@W7I_$F1f0iK0kuFqzy&ksdn*CX zC{FdSR&CmF)$kA6FE5^8#{BjPmOO$bg8lVX(-lT9^%4r z|Al>uY1qxbs|L8_I0fLWlRFY~L4|o9hXW%EmlfW~;^8oMi ze*k>04+4J%M27wXA)pp$1Xcnk0?j}ehyq=}8sKm!M@s9$%gt5yjSt8BJLBs|)se}8 z(a}WzM7)1!cpwpr4}?_@z0#l#jtnP82ZrM@HLhCrjH*UsIm@)vWOycRPmIT-xvs7N zC30Oo;XyCec6q6$r-9}?$*^N5qp1zi$XK+kYiJ}s0s{BaoERU9PsIBM!+m3e;VrTC zu|%|*YGOLgHAW-qXe_ZY+P|T*JGwF2(H5N;iFd7zb&VxrBON1}qlP(?_Ct=QHgvA; z>Ra6w9UVVsV)LX(mzDR5Op);L*#Vjhof+$k$5)SzCsy;_(~g(ahSlAXcvqV`IW``y zr~N3<(G?%$tY!qGonv9Ovww7?e>9FghK)8N(;mFUM&t3(za$s`_!ssYmy5sTi!dNy z^GHNXJRni^1#p}&=--$Hm$q51K|&^gpD?>Y27j0!h>|bqucq6V(L3|)A>5h`5i8J zGibudWb&BFjE=7%W3Q!Gt1+DrdIiNXF4Ds($W^#avu;u0&~0qpM~GRB{hW4V_OdA> zrFAoUg!N+N%4JNtm+j4HN$3%_lG1yq#{Lz}EpNrbSot(&484SA;X>eNw=1)nqZD+^ zoTXsuQhEln?b>h9+{!j(EBNXTNaK_;T~NmLnF;+u=oxwr4%o<=ZEgKUjmRo%M4#pi zXHK@D*)$_7oCuF<*%k%E-k}N9DkM@?*0MuPXrcWD6XXG1BoE55E)otv`^dwOgZV&~tZOVw2+}=#`g`7*OdDGp(Uad&!GZ>SxgKJvzQzWi(+D7Q0AIt9)vLFpU$0=n7oj68XA=}X}%wsowhuCk#n)D|{_A7QZs*1KL zqj^^qSRbo*Tcov`tv86Th)_?k{|soDSWd^XQ~L9a9THWjqeCP_#IFtOZA#2UyIE}y z+m(az3XSY;c6kO9X5>u$5gO;JoqMym4*s)G3DQkt)()(!+O@3>rua)j5Zpw3P~1{f zBI~gNGgjVK)(^xc=QX`Foj5t$M{=!j^I;X%*+BGN))04?sIxtM`&|T0t+gO^4vdc` z1nxZetT}IxS~1@wSpm=O|+_ zb{2M}?rfD)nF2PkyGEczY%-^r8Kai4eKiGN*-Y5N)QR=c1475d`i@3g}`^VgeliH(rNt(NlZzc1!>{vuc$wb$@fVtDXvWAU!w<G6u)%^ia6*<m zicNaz67L`=ru80yCaBBNGN-XbEBMHchpAW3Dn`9`9hms_9uHQ_yxy~-=Mf>$IDRI} z#jOnJE@OE^u@;Z;HEFzgJRWN~*Lz{o%!3u$I;%d_&sv37WOpaH9k?HMY|)-@;FLRs z(YTaSnkVApEjI|XSg822FX8A(TfbC}!GyAi?VC0&D+TtYYZ>TCH+lD9H-ZJAve$eO ztiZDZ3sbHAV-lb6G$bFVdhKyy*9!?uUef)(z`no|Mi%DQi{9QDH2K%QTyj>+8cs&` zO@UL}*7PjZ8C62ad6Wg~Jc8#&7Kr#Cn8l!qhF&;t|H-$~DfCtR!4EFDe$zw?WJ?P2 z&$e*5q|6p_)^~}u(NtE?A?-S$p%9RLOFSzL>v0badRFKg1U8j2W?)S|!=yJQ--r-F zANI?NZa?h}ZuLl!jY+poZd5l4UO}MG_+FDW$gy`=c(9t5RbrDGzI!lqM^Pi;Xyl|0 znGP`U`F1ZoEV3{9J*SayA3N1>*Gj=FWssmZhYmtan@@R8h_1Oo7S>DJybWthC(P=r zC{&Ou#gCKlE$UTUyTft?{oDuT5_01L%gDZ3NnfE}u-N>q-Z=_GWGQ1|Fk-i0rp;4H z(J}{Rag5j}d|Y9@g-vBF10wb(e$np}J+kzrO7V5yD#Q5~VIP-c>mjp~*7UV(CyYT8 z2H(#l)lz2P_N`27j|s#o)ue9sdxKATUlO`4mNgwIhhEQR)h+MrTE!snaCn;^}hOc{_V zIbZPeEJ&NA15Vya>Nyw6k_oM3jo@SoU^{l|U!;|q{hDV*+rI_`@ij3Nc}RVZ)5Co zFZ)deeP8b7&V%!+>3;=d@Mfe=^-aX%vDoC4j~?{V6D*Xo8GY8H!uN5Hj}BBaL2lCB zRSf%9hq^Y=Ei}ZwUMc>oPerT!LMU|?F;E&5jnsogQ&iJ8E7Zj|3&Jx#y1r7$rawy3 zZ>aLi6zw029al-er8>nIlB$B6w3!(r`>Z5?G$5@a_IV%ek=cHdCM6-x*bn^u87-xE zR@KFBpvzjZuwbap4J4jjS*yJ&R6i}ybG`&5RmynY7ixV|sIebnA^jKC6s)=mq7Da5 zZulA&(!Nm5R*LjR#vHG7LMQ?y>J0zxRz)|UY}hxF_&bU6!KkgJI!UqYgDmD^&rLCU zyh>=OI;%|?%-g4tZyH1Mi>p0CPVk=Q7hd$xC?*>QrH7uDp)s*2C}rnH=ubFGJ@ zDReQZJOAVhrP3@ICf3LOOzB#>L!?~+w!AXQojtntV{x(JQq?xv-KHe4(T(KaNy3jq z(j8K>k^L}<(Mq%EILe>y>5ZADaju8%W)W!8&q}#*-GTa$yt(rFSk8u;5shxUfD%yo zrp;T0P|tw$B23<*hP$-kGH+*f`>~ph+`M^}T5p;*oaU}oVGz=L!QMmy2x@v`7aVu-z5 zIRFotHj6_I7Rlx9+k7$Z47)?1Z56_!-kRw|t$3E`U&1!&yI??^Q_-aFy3R{uBwZ;4 z|41jWdjl+8CvEl#->>#GhJLlcA2vKZd{qTK&*T>Z!bfW9L_s`RraR=7n)`z6`=fBR zwhLawblymPTnwc=OiL?SJ#T+X4y88MD^~8SRrIn?x|%$9)QDdSk|FuDP-H^STwF(i zMYN()_*i|yN}Ja76Afo65VW~Jcp9~z)l5ulj*CZ|1FvD8! zYhb^t5O##9s*WDSw`WLGF)D!Z4A3nbe>cxUeIeVWqd6?j=#a9R5Ei z4^qFXmTq8Df;>K}yB)I-{Mi7qy?s!tVvCj>ffV=<7fCQ>-GCJITlPH40aH`@rbp ze?`LURl+yDCm7D+M&T`QBw0vmarKuUm`rl*d|&zW8dBoa)+cu zGMTsSyICl6b`?!CT3a93Ls$3frmGEhMSA8!(k-P4_j%~U_pK6W998eBLj z2x7KExLS~70#z~bA8N;ozVZI0!B5o+ghREpei4kc(^okwOQ$UnudZIRdRyC{y<)E~ zk)Ou2Z>XN)uI3B!Z!Gd`s+Q_SX{(35EqbCPZj$NvYPz*rI+?29_WDdY_zl0rMhJuk zN3=Oj-fNdKh3Ga4e99C0&`lCsMg#CV6+T4lUbgf@`zW}PYO23Si*j>bc`HeCB&@An zzAH-1&*%@t{%Q%ksJ`;=S@0!aUGTe75MDYK49@BbclF#NQH59dQ&5ycfmRYHSdA0( z?!k0hFa*>3U5(OX4Z9KG zmxc7l>tYtj&PR%oI@PPD6zh|LvrHprq`n9ro+s!o6T~m0Q!EB)pF6vnwM!v=o1e8T zWA{p46Mon#@qQ96ucPKbBxCJ}9xu4nIRD>M`ylQ!;Yfq~Qu4V=LNQazT3@F9CM;Pv zYfLL`<(;%xtG6*>tCytUK{8Fa5o{gSb$&ap7wih$kMFfs1)iF;(R$w^W;MW-Sb)-u ziFO96*=K$14M}`HC>}5Mf(G>ekSc%4Xpx`VSkq4H^B(qCP=2~0c#7BW9hSJw2Rp35 zUB#6*3(^^qGOAq?z?mZ3%iu~DQn-=ERnI;zN zY*X96y7V0YNY^cc|NERR{}`k7JGW(>bHoskJjaq;l)L&6UOzB@$u693}xc@5%jp((6hRDYq;45wv6JwvE% zxeIFBGP=293I;WNMh^zX_Qm91a-tfZrP^@y(sn(y)lcjOlJg{X*DSi=0Y$oi36Im% zcx)nU>yIk(-_t%tjVaCJT276%T(F1*8hS6x8}^|kK^2}<9ADi~L5r73iYVSj!cS^^ z+bX0_P>pj7T-6OiNMYYtY{B`JH}!T#>=HS6`Elg!rYW4z*wA7;^p>Z_{kkmvT$KDi zVY8Q>WC`fc`t(6%DyhNN%YIhu!jj#(l0_QDbzbiSA@7?@SSVN-5j`vFf_38Jy+=Gh zSS1qq55j_Tfj2R&hr?v)t&~5G=-==p{+$Od$}T@%;GHaGNXuGRdvJHnH2 z`phG%)y>qqk)J;!v0d%Hm&I17_p~8|8AFJ z>u~y9C&l<7v?f@2lk1#6l-oBIkE3OFsg|>mgw`U7g$L98o(i7@xT-*&bvVG`w@*Sv zQso4~*SQ(M1cGd!q{GwBL#VWX3-7PY=TsEG{otefLs(w{< zZ)UGt`FOP|#4gHCNr+)V<(N1wj)(zK!WT+kf65;M*{}JdN9Y@3RN|NLmt;^Q&|S%X z8xUg4u)U_dV>W!J<$95EwtU?%&4lO3P3f4ErfJ+5D;o(S5lIEyGCa$b=;E~;2Z)=t zbeiT~USN4mD>CDDKtQAA*$uxb4WPZ28?;k_zjjMDV&c0pwwhLK0lsBO4{+OCr@mKL zJ{o_|*xr8Z%w8`5j-QtdFlSzwjA4vibhj*&YELK@pZk`Zcw*>u-6J}8MR#33fo1i z89~nOzuVmfi@bkB=}0S5hot-d%>e!z4+1}m>@`Q4p$r_WAzkUkUFM%1?g`I%r`AQf zLEL`gRY^_C(8cn5w~Oo36_WGh=~k*+yz3G*i$~t};?uR_hD_O+*r#@f3U#5hVn0;j z(6QG-+LMTeLE8#L|1(myDZ|M2J>phtmC4GKfuO|ECM&E=RcgjIg@2LE2W1l>E*?UT zKz)E@aUXRBbrkgxWM7~zqEasT_4nOfWRydchsHDyjd%{VkTbxeoKq5mNzOsX^WfF~ zP&x)k$0=0OONDe477n~o;5U*!Loa}?p;8~Xr+j!7ZQ8Hfu7%t||NDuKKC(N4%66%> zW9gz1cY*mS!72aB;4$t5zS)jGe&PpU`;nXBjlyU34g#+Og?I?uSb|eyctz`_G0>Qh zhJ^}cXh(p%)gExi|$3`;(nJotq(}e@TI=-O}M?E{HRP6-sV6LrSwa+%IU( zeguZg5IDooKzmHxZ?Jy}L-Uz|rFLH9j$PWfG;dV56S}>Y!!phIj+QU$b|;5tn(=DW z0z-2{h0a*EFwOY7ma{*d9G+>$-^JWk=Hq^_YHx8QFy*o~BvfsHV3x{$38xG_M=MXfNboPfZVM``q->&(oim zb3bnL@;sf~^KpKRrIAn1e-5%kUFjInHXbK}3V&!EEz8w~%gYxg`^3of330ksJ##|T zCsC#qzK7o95Wb1}fdXYdv|jK#vGjb`Z?=$eE0s#qckISS+Zhiw{3zJ4Vp(> Date: Wed, 14 Feb 2024 11:04:12 -0800 Subject: [PATCH 11/13] Fixed punctuation in documentation in test_gene_info.py Co-authored-by: Brad Macdonald <52762200+BWMac@users.noreply.github.com> --- tests/transform/test_gene_info.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/transform/test_gene_info.py b/tests/transform/test_gene_info.py index 4b422612..a171211f 100644 --- a/tests/transform/test_gene_info.py +++ b/tests/transform/test_gene_info.py @@ -233,11 +233,11 @@ def read_input_files_dict(self, input_files_dict: dict) -> dict: gene_info are in csv format, but the 'gene_metadata' file is in feather format and needs special casing. Args: - input_files_dict - a dictionary where keys are the names of the datasets, as expected by + input_files_dict: a dictionary where keys are the names of the datasets, as expected by transform_gene_info, and values are the filenames to load Returns: - datasets - a dictionary where the keys are the names of the datasets, as expected by + datasets: a dictionary where the keys are the names of the datasets, as expected by transform_gene_info, and the values are data frames """ datasets = {} From 9f2976f7e71b32a2cdde61d0a91a2d6ea1f38012 Mon Sep 17 00:00:00 2001 From: Jaclyn Beck Date: Wed, 14 Feb 2024 12:26:32 -0800 Subject: [PATCH 12/13] Updated TypeError messages for tep_info in the gene_info transform --- src/agoradatatools/etl/transform/gene_info.py | 8 ++++++-- tests/transform/test_gene_info.py | 4 ++-- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/src/agoradatatools/etl/transform/gene_info.py b/src/agoradatatools/etl/transform/gene_info.py index ef84bcc3..5e5fccad 100644 --- a/src/agoradatatools/etl/transform/gene_info.py +++ b/src/agoradatatools/etl/transform/gene_info.py @@ -98,9 +98,13 @@ def transform_gene_info( # check below. tep_info = tep_info.fillna({"is_adi": False, "is_tep": False}) if tep_info["is_adi"].dtype != bool: - raise TypeError("wrong data type in 'is_adi' column") + raise TypeError( + f"'is_adi' column must be 'bool', current type is {tep_info['is_adi'].dtype}" + ) if tep_info["is_tep"].dtype != bool: - raise TypeError("wrong data type in 'is_tep' column") + raise TypeError( + f"'is_tep' column must be 'bool', current type is {tep_info['is_tep'].dtype}" + ) # For genes with either is_adi or is_tep set to True, create a resource URL that opens # the portal page to the specific gene. This must be done using the hgnc_symbol from the diff --git a/tests/transform/test_gene_info.py b/tests/transform/test_gene_info.py index a171211f..c4449eb9 100644 --- a/tests/transform/test_gene_info.py +++ b/tests/transform/test_gene_info.py @@ -205,14 +205,14 @@ class TestTransformGeneInfo: {"tep_adi_info": "tep_adi_info_type_error_2.csv"}, param_set_1, TypeError, - "wrong data type in 'is_adi' column", + "'is_adi' column must be 'bool'", ), ( # is_tep is a string core_files, {"tep_adi_info": "tep_adi_info_type_error_3.csv"}, param_set_1, TypeError, - "wrong data type in 'is_tep' column", + "'is_tep' column must be 'bool'", ), ] fail_test_ids = [ From 7a2fe11d780dd91ba85302b8bb3d1dc33760a779 Mon Sep 17 00:00:00 2001 From: Jaclyn Beck Date: Thu, 22 Feb 2024 13:02:56 -0800 Subject: [PATCH 13/13] Changed resource url variables in gene_info to constants --- src/agoradatatools/etl/transform/gene_info.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/src/agoradatatools/etl/transform/gene_info.py b/src/agoradatatools/etl/transform/gene_info.py index 5e5fccad..55ede085 100644 --- a/src/agoradatatools/etl/transform/gene_info.py +++ b/src/agoradatatools/etl/transform/gene_info.py @@ -5,8 +5,8 @@ def transform_gene_info( - datasets: dict, adjusted_p_value_threshold, protein_level_threshold -): + datasets: dict, adjusted_p_value_threshold: float, protein_level_threshold: float +) -> pd.DataFrame: """ This function will perform transformations and incrementally create a dataset called gene_info. Each dataset will be left_joined onto gene_info, starting with gene_metadata. @@ -111,12 +111,17 @@ def transform_gene_info( # tep_info file and not the symbol in gene_info, because there are some mismatches # between the two and the hgnc_symbol from tep_info is the correct one to use here. # resource_url should be NA if both is_adi and is_tep are false. - resource_url_prefix = "https://adknowledgeportal.synapse.org/Explore/Target%20Enabling%20Resources?QueryWrapper0=%7B%22sql%22%3A%22select%20*%20from%20syn26146692%20WHERE%20%60isPublic%60%20%3D%20true%22%2C%22limit%22%3A25%2C%22offset%22%3A0%2C%22selectedFacets%22%3A%5B%7B%22concreteType%22%3A%22org.sagebionetworks.repo.model.table.FacetColumnValuesRequest%22%2C%22columnName%22%3A%22target%22%2C%22facetValues%22%3A%5B%22" - resource_url_suffix = "%22%5D%7D%5D%7D" + RESOURCE_URL_PREFIX = ( + "https://adknowledgeportal.synapse.org/Explore/Target%20Enabling%20Resources?QueryWrapper0=%7B%22sql%22%3A%22" + + "select%20*%20from%20syn26146692%20WHERE%20%60isPublic%60%20%3D%20true%22%2C%22limit%22%3A25%2C%22offset%22" + + "%3A0%2C%22selectedFacets%22%3A%5B%7B%22concreteType%22%3A%22org.sagebionetworks.repo.model.table." + + "FacetColumnValuesRequest%22%2C%22columnName%22%3A%22target%22%2C%22facetValues%22%3A%5B%22" + ) + RESOURCE_URL_SUFFIX = "%22%5D%7D%5D%7D" tep_info["resource_url"] = tep_info.apply( lambda row: ( - resource_url_prefix + row["hgnc_symbol"] + resource_url_suffix + RESOURCE_URL_PREFIX + row["hgnc_symbol"] + RESOURCE_URL_SUFFIX if row["is_adi"] is True or row["is_tep"] is True else np.NaN ),