diff --git a/config.yaml b/config.yaml index a4f5a8be..265e0d6f 100644 --- a/config.yaml +++ b/config.yaml @@ -20,10 +20,10 @@ sources: - agora_proteomics: agora_proteomics_files: &agora_proteomics_files - name: proteomics - id: syn18689335.3 + id: syn18689335.4 format: csv agora_proteomics_provenance: &agora_proteomics_provenance - - syn18689335.3 + - syn18689335.4 - agora_proteomics_tmt: agora_proteomics_tmt_files: &agora_proteomics_tmt_files - name: proteomics_tmt @@ -89,6 +89,7 @@ datasets: - proteomics: files: *agora_proteomics_files final_format: json + custom_transformations: 1 provenance: *agora_proteomics_provenance column_rename: genename: hgnc_symbol @@ -99,6 +100,7 @@ datasets: - proteomics_tmt: files: *agora_proteomics_tmt_files final_format: json + custom_transformations: 1 provenance: *agora_proteomics_tmt_provenance column_rename: genename: hgnc_symbol @@ -109,6 +111,7 @@ datasets: - proteomics_srm: files: *agora_proteomics_srm_files final_format: json + custom_transformations: 1 provenance: *agora_proteomics_srm_provenance column_rename: genename: hgnc_symbol @@ -290,6 +293,7 @@ datasets: custom_transformations: 1 provenance: *rna_diff_expr_data_provenance destination: *dest + gx_enabled: true - proteomics_distribution_data: files: diff --git a/gx_suite_definitions/rna_distributinon_data.ipynb b/gx_suite_definitions/rna_distributinon_data.ipynb new file mode 100644 index 00000000..1e642889 --- /dev/null +++ b/gx_suite_definitions/rna_distributinon_data.ipynb @@ -0,0 +1,243 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import synapseclient\n", + "\n", + "import great_expectations as gx\n", + "\n", + "context = gx.get_context(project_root_dir='../src/agoradatatools/great_expectations')\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Create Expectation Suite for RNA Distribution Data" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, 
+ "source": [ + "## Get Example Data File" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "syn = synapseclient.Synapse()\n", + "syn.login()\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "rna_distribution_data_file = syn.get(\"syn28094691\").path\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Create Validator Object on Data File" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "validator = context.sources.pandas_default.read_json(\n", + " rna_distribution_data_file\n", + ")\n", + "validator.expectation_suite_name = \"rna_distribution_data\"\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Add Expectations to Validator Object For Each Column" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# model\n", + "validator.expect_column_values_to_be_of_type(\"model\", \"str\")\n", + "validator.expect_column_values_to_not_be_null(\"model\")\n", + "validator.expect_column_values_to_be_in_set(\"model\", [\"AD Diagnosis (males and females)\", \"AD Diagnosis x AOD (males and females)\",\"AD Diagnosis x Sex (females only)\", \"AD Diagnosis x Sex (males only)\"])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# tissue\n", + "validator.expect_column_values_to_be_of_type(\"tissue\", \"str\")\n", + "validator.expect_column_values_to_not_be_null(\"tissue\")\n", + "validator.expect_column_values_to_be_in_set(\"tissue\", [\"CBE\", \"DLPFC\", \"FP\", \"IFG\", \"PHG\", \"STG\", \"TCX\", \"ACC\", \"PCC\"])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# min\n", + "validator.expect_column_values_to_be_of_type(\"min\", 
\"float\")\n", + "validator.expect_column_values_to_not_be_null(\"min\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# max\n", + "validator.expect_column_values_to_be_of_type(\"max\", \"float\")\n", + "validator.expect_column_values_to_not_be_null(\"max\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# median\n", + "validator.expect_column_values_to_be_of_type(\"median\", \"float\")\n", + "validator.expect_column_values_to_not_be_null(\"median\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# first_quartile\n", + "validator.expect_column_values_to_be_of_type(\"first_quartile\", \"float\")\n", + "validator.expect_column_values_to_not_be_null(\"first_quartile\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# third_quartile\n", + "validator.expect_column_values_to_be_of_type(\"third_quartile\", \"float\")\n", + "validator.expect_column_values_to_not_be_null(\"third_quartile\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# multi-field\n", + "validator.expect_column_pair_values_a_to_be_greater_than_b(\"max\", \"third_quartile\")\n", + "validator.expect_column_pair_values_a_to_be_greater_than_b(\"third_quartile\", \"median\")\n", + "validator.expect_column_pair_values_a_to_be_greater_than_b(\"median\", \"first_quartile\")\n", + "validator.expect_column_pair_values_a_to_be_greater_than_b(\"first_quartile\", \"min\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Save Expectation Suite" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "validator.save_expectation_suite(discard_failed_expectations=False)\n" + ] + }, + { + 
"cell_type": "markdown", + "metadata": {}, + "source": [ + "## Create Checkpoint and View Results" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "checkpoint = context.add_or_update_checkpoint(\n", + " name=\"agora-test-checkpoint\",\n", + " validator=validator,\n", + ")\n", + "checkpoint_result = checkpoint.run()\n", + "context.view_validation_result(checkpoint_result)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Build Data Docs - Click on Expectation Suite to View All Expectations" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "context.build_data_docs()\n", + "context.open_data_docs()\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.13" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/src/agoradatatools/etl/transform/__init__.py b/src/agoradatatools/etl/transform/__init__.py index b281c8b2..ef000e81 100644 --- a/src/agoradatatools/etl/transform/__init__.py +++ b/src/agoradatatools/etl/transform/__init__.py @@ -15,6 +15,7 @@ transform_rnaseq_differential_expression, ) from agoradatatools.etl.transform.team_info import transform_team_info +from agoradatatools.etl.transform.proteomics import transform_proteomics __all__ = [ "transform_distribution_data", @@ -26,4 +27,5 @@ "transform_rna_distribution_data", "transform_rnaseq_differential_expression", "transform_team_info", + "transform_proteomics", ] diff --git a/src/agoradatatools/etl/transform/gene_info.py b/src/agoradatatools/etl/transform/gene_info.py index 55ede085..a812a89e 100644 --- 
a/src/agoradatatools/etl/transform/gene_info.py +++ b/src/agoradatatools/etl/transform/gene_info.py @@ -2,6 +2,7 @@ import pandas as pd from agoradatatools.etl.utils import nest_fields +from agoradatatools.etl import transform def transform_gene_info( @@ -14,10 +15,10 @@ def transform_gene_info( gene_metadata = datasets["gene_metadata"] igap = datasets["igap"] eqtl = datasets["eqtl"] - proteomics = datasets["proteomics"] + proteomics = transform.transform_proteomics(df=datasets["proteomics"]) rna_change = datasets["diff_exp_data"] - proteomics_tmt = datasets["proteomics_tmt"] - proteomics_srm = datasets["proteomics_srm"] + proteomics_tmt = transform.transform_proteomics(df=datasets["proteomics_tmt"]) + proteomics_srm = transform.transform_proteomics(df=datasets["proteomics_srm"]) target_list = datasets["target_list"] median_expression = datasets["median_expression"] druggability = datasets["druggability"] diff --git a/src/agoradatatools/etl/transform/proteomics.py b/src/agoradatatools/etl/transform/proteomics.py new file mode 100644 index 00000000..cb6d1e5b --- /dev/null +++ b/src/agoradatatools/etl/transform/proteomics.py @@ -0,0 +1,25 @@ +"""Function for transforming proteomics data. This function is called on all three proteomics +data sets, although currently it only affects the LFQ data set as it is the only one with "CON__" +entries. +""" + +import pandas as pd + + +def transform_proteomics(df: pd.DataFrame) -> pd.DataFrame: + """Filters out rows that have "CON__" in their uniqid. This label indicates that the protein + is a known contaminant and should be removed from the final data set. Rows with an NA uniqid + are also removed. + + Args: + df (pd.DataFrame): pandas DataFrame containing proteomics data. Must contain a column + called "uniqid". + + Returns: + pd.DataFrame: a DataFrame that is identical to the input DataFrame but with rows containing + "CON__" in the uniqid removed.
+ """ + # Using "na=True" causes rows with NA uniqids to be set to True so they get removed + remove_rows = df["uniqid"].str.contains("CON__", na=True) + df = df.drop(df.index[remove_rows]) + return df diff --git a/src/agoradatatools/etl/transform/proteomics_distribution.py b/src/agoradatatools/etl/transform/proteomics_distribution.py index c05c08fe..6c4f3d85 100644 --- a/src/agoradatatools/etl/transform/proteomics_distribution.py +++ b/src/agoradatatools/etl/transform/proteomics_distribution.py @@ -1,6 +1,6 @@ import pandas as pd -from agoradatatools.etl import utils +from agoradatatools.etl import utils, transform def transform_proteomics_distribution_data(datasets: dict) -> pd.DataFrame: @@ -18,6 +18,9 @@ def transform_proteomics_distribution_data(datasets: dict) -> pd.DataFrame: """ transformed = [] for name, dataset in datasets.items(): + # Remove contaminant ("CON__") entries and rows with NA uniqids before calculating distribution + dataset = transform.transform_proteomics(df=dataset) + df = utils.calculate_distribution( df=dataset, grouping="tissue", distribution_column="log2_fc" ) diff --git a/src/agoradatatools/great_expectations/gx/expectations/rna_distribution_data.json b/src/agoradatatools/great_expectations/gx/expectations/rna_distribution_data.json new file mode 100644 index 00000000..6607fb37 --- /dev/null +++ b/src/agoradatatools/great_expectations/gx/expectations/rna_distribution_data.json @@ -0,0 +1,178 @@ +{ + "data_asset_type": null, + "expectation_suite_name": "rna_distribution_data", + "expectations": [ + { + "expectation_type": "expect_column_values_to_be_of_type", + "kwargs": { + "column": "model", + "type_": "str" + }, + "meta": {} + }, + { + "expectation_type": "expect_column_values_to_not_be_null", + "kwargs": { + "column": "model" + }, + "meta": {} + }, + { + "expectation_type": "expect_column_values_to_be_in_set", + "kwargs": { + "column": "model", + "value_set": [ + "AD Diagnosis (males and females)", + "AD Diagnosis x AOD (males and 
females)", + "AD Diagnosis x Sex (females only)", + "AD Diagnosis x Sex (males only)" + ] + }, + "meta": {} + }, + { + "expectation_type": "expect_column_values_to_be_of_type", + "kwargs": { + "column": "tissue", + "type_": "str" + }, + "meta": {} + }, + { + "expectation_type": "expect_column_values_to_not_be_null", + "kwargs": { + "column": "tissue" + }, + "meta": {} + }, + { + "expectation_type": "expect_column_values_to_be_in_set", + "kwargs": { + "column": "tissue", + "value_set": [ + "CBE", + "DLPFC", + "FP", + "IFG", + "PHG", + "STG", + "TCX", + "ACC", + "PCC" + ] + }, + "meta": {} + }, + { + "expectation_type": "expect_column_values_to_be_of_type", + "kwargs": { + "column": "min", + "type_": "float" + }, + "meta": {} + }, + { + "expectation_type": "expect_column_values_to_not_be_null", + "kwargs": { + "column": "min" + }, + "meta": {} + }, + { + "expectation_type": "expect_column_values_to_be_of_type", + "kwargs": { + "column": "max", + "type_": "float" + }, + "meta": {} + }, + { + "expectation_type": "expect_column_values_to_not_be_null", + "kwargs": { + "column": "max" + }, + "meta": {} + }, + { + "expectation_type": "expect_column_values_to_be_of_type", + "kwargs": { + "column": "median", + "type_": "float" + }, + "meta": {} + }, + { + "expectation_type": "expect_column_values_to_not_be_null", + "kwargs": { + "column": "median" + }, + "meta": {} + }, + { + "expectation_type": "expect_column_values_to_be_of_type", + "kwargs": { + "column": "first_quartile", + "type_": "float" + }, + "meta": {} + }, + { + "expectation_type": "expect_column_values_to_not_be_null", + "kwargs": { + "column": "first_quartile" + }, + "meta": {} + }, + { + "expectation_type": "expect_column_values_to_be_of_type", + "kwargs": { + "column": "third_quartile", + "type_": "float" + }, + "meta": {} + }, + { + "expectation_type": "expect_column_values_to_not_be_null", + "kwargs": { + "column": "third_quartile" + }, + "meta": {} + }, + { + "expectation_type": 
"expect_column_pair_values_a_to_be_greater_than_b", + "kwargs": { + "column_A": "max", + "column_B": "third_quartile" + }, + "meta": {} + }, + { + "expectation_type": "expect_column_pair_values_a_to_be_greater_than_b", + "kwargs": { + "column_A": "third_quartile", + "column_B": "median" + }, + "meta": {} + }, + { + "expectation_type": "expect_column_pair_values_a_to_be_greater_than_b", + "kwargs": { + "column_A": "median", + "column_B": "first_quartile" + }, + "meta": {} + }, + { + "expectation_type": "expect_column_pair_values_a_to_be_greater_than_b", + "kwargs": { + "column_A": "first_quartile", + "column_B": "min" + }, + "meta": {} + } + ], + "ge_cloud_id": null, + "meta": { + "great_expectations_version": "0.18.1" + } +} diff --git a/src/agoradatatools/process.py b/src/agoradatatools/process.py index 8d66b08e..75854767 100644 --- a/src/agoradatatools/process.py +++ b/src/agoradatatools/process.py @@ -56,6 +56,9 @@ def apply_custom_transformations(datasets: dict, dataset_name: str, dataset_obj: return transform.transform_rna_distribution_data(datasets=datasets) if dataset_name == "proteomics_distribution_data": return transform.transform_proteomics_distribution_data(datasets=datasets) + if dataset_name in ["proteomics", "proteomics_tmt", "proteomics_srm"]: + df = datasets[dataset_name] + return transform.transform_proteomics(df=df) else: return None @@ -127,7 +130,7 @@ def process_dataset( filename=dataset_name + "." 
+ dataset_obj[dataset_name]["final_format"], ) - gx_enabled = "gx_enabled" in dataset_obj[dataset_name].keys() + gx_enabled = dataset_obj[dataset_name].get("gx_enabled", False) if gx_enabled: gx_runner = GreatExpectationsRunner( diff --git a/test_config.yaml b/test_config.yaml index 53860100..cc1f29a6 100644 --- a/test_config.yaml +++ b/test_config.yaml @@ -20,10 +20,10 @@ sources: - agora_proteomics: agora_proteomics_files: &agora_proteomics_files - name: proteomics - id: syn18689335.3 + id: syn18689335.4 format: csv agora_proteomics_provenance: &agora_proteomics_provenance - - syn18689335.3 + - syn18689335.4 - agora_proteomics_tmt: agora_proteomics_tmt_files: &agora_proteomics_tmt_files - name: proteomics_tmt @@ -89,6 +89,7 @@ datasets: - proteomics: files: *agora_proteomics_files final_format: json + custom_transformations: 1 provenance: *agora_proteomics_provenance column_rename: genename: hgnc_symbol @@ -99,6 +100,7 @@ datasets: - proteomics_tmt: files: *agora_proteomics_tmt_files final_format: json + custom_transformations: 1 provenance: *agora_proteomics_tmt_provenance column_rename: genename: hgnc_symbol @@ -109,6 +111,7 @@ datasets: - proteomics_srm: files: *agora_proteomics_srm_files final_format: json + custom_transformations: 1 provenance: *agora_proteomics_srm_provenance column_rename: genename: hgnc_symbol @@ -290,6 +293,7 @@ datasets: custom_transformations: 1 provenance: *rna_diff_expr_data_provenance destination: *dest + gx_enabled: true - proteomics_distribution_data: files: diff --git a/tests/test_assets/gene_info/input/proteomics_good_input.csv b/tests/test_assets/gene_info/input/proteomics_good_input.csv index 7071f5a2..08c9fd1a 100644 --- a/tests/test_assets/gene_info/input/proteomics_good_input.csv +++ b/tests/test_assets/gene_info/input/proteomics_good_input.csv @@ -15,3 +15,4 @@ DPM1|O60762,DPM1,O60762,ENSG00000000419,AntPFC,0.052,0.215,-0.111,0.734,1.0 GCLC|P48506,,P48506,ENSG00000001084,AntPFC,-0.023,0.077,-0.123,0.848,1.0 
CFH|P08603,CFH,P08603,ENSG00000000971,AntPFC,,,,, ,CYP51A1,Q16850,ENSG00000001630,AntPFC,0.265,0.567,-0.037,0.099,0.565 +CON__P35908,CON__P35908,P35908,ENSG00000172867,DLPFC,-0.096819,0.16057,-0.354207,0.764535,1.0 diff --git a/tests/test_assets/proteomics/input/proteomics_lfq_good_input.csv b/tests/test_assets/proteomics/input/proteomics_lfq_good_input.csv new file mode 100644 index 00000000..589cc137 --- /dev/null +++ b/tests/test_assets/proteomics/input/proteomics_lfq_good_input.csv @@ -0,0 +1,7 @@ +uniqid,genename,uniprotid,ensg,tissue,log2_fc,ci_upr,ci_lwr,pval,cor_pval +ACAT1|P24752,ACAT1,P24752,ENSG00000075239,DLPFC,-0.043497787,-0.001436119,-0.085559454,0.039605844,0.121519784 +CON__P35908,CON__P35908,P35908,ENSG00000172867,MFG,-0.096819,0.16057,-0.354207,0.764535,1.0 +MOGS|Q13724-2,MOGS,Q13724-2,ENSG00000115275,MFG,-0.024737651,0.268741708,-0.318217011,0.977232785,0.999999988 +CPLX1|O14810,CPLX1,O14810,ENSG00000168993,TCX,-0.095310355,0.051454069,-0.242074778,0.277450316,0.793276833 +MYH14|Q7Z406,MYH14,Q7Z406,ENSG00000105357,AntPFC,0.036596903,0.187572953,-0.114379147,0.835493315,0.999999904 +CON__P35908,CON__P35908,P35908,ENSG00000172867,DLPFC,-0.096819,0.16057,-0.354207,0.764535,1.0 diff --git a/tests/test_assets/proteomics/input/proteomics_lfq_missing_input.csv b/tests/test_assets/proteomics/input/proteomics_lfq_missing_input.csv new file mode 100644 index 00000000..035cedc3 --- /dev/null +++ b/tests/test_assets/proteomics/input/proteomics_lfq_missing_input.csv @@ -0,0 +1,4 @@ +uniqid,genename,uniprotid,ensg,tissue,log2_fc,ci_upr,ci_lwr,pval,cor_pval +ACAT1|P24752,ACAT1,P24752,ENSG00000075239,DLPFC,-0.043497787,-0.001436119,-0.085559454,0.039605844,0.121519784 +,DDX39B,Q13838,ENSG00000198563,DLPFC,0.067231618,0.137345114,-0.002881877,0.065488223,0.183367023 +FKBP8|Q14318,FKBP8,Q14318,ENSG00000105701,DLPFC,-0.037274408,0.048093291,-0.122642107,0.67121033,1 diff --git a/tests/test_assets/proteomics/input/proteomics_lfq_no_uniqid_input.csv 
b/tests/test_assets/proteomics/input/proteomics_lfq_no_uniqid_input.csv new file mode 100644 index 00000000..45fef0d3 --- /dev/null +++ b/tests/test_assets/proteomics/input/proteomics_lfq_no_uniqid_input.csv @@ -0,0 +1,4 @@ +bad_field,genename,uniprotid,ensg,tissue,log2_fc,ci_upr,ci_lwr,pval,cor_pval +ACAT1|P24752,ACAT1,P24752,ENSG00000075239,DLPFC,-0.043497787,-0.001436119,-0.085559454,0.039605844,0.121519784 +MOGS|Q13724-2,MOGS,Q13724-2,ENSG00000115275,MFG,-0.024737651,0.268741708,-0.318217011,0.977232785,0.999999988 +FKBP8|Q14318,FKBP8,Q14318,ENSG00000105701,DLPFC,-0.037274408,0.048093291,-0.122642107,0.67121033,1 diff --git a/tests/test_assets/proteomics/input/proteomics_srm_good_input.csv b/tests/test_assets/proteomics/input/proteomics_srm_good_input.csv new file mode 100644 index 00000000..9ba79205 --- /dev/null +++ b/tests/test_assets/proteomics/input/proteomics_srm_good_input.csv @@ -0,0 +1,6 @@ +uniqid,hgnc_symbol,uniprotid,ensembl_gene_id,tissue,log2_fc,ci_upr,ci_lwr,pval,cor_pval +UQCR10|Q9UDW1,UQCR10,Q9UDW1,ENSG00000184076,DLPFC,-0.047856325401006046,-0.006795632637535513,-0.08891701816447659,0.017404524464477378,0.050609778954836496 +RUVBL1|Q9Y265,RUVBL1,Q9Y265,ENSG00000175792,DLPFC,0.05453315389554793,0.09014995830291843,0.018916349488177428,0.0009952823691273815,0.0049961119358832295 +CASS4|Q9NQ75,CASS4,Q9NQ75,ENSG00000087589,DLPFC,-0.029649429337509504,0.06993115111279105,-0.12923000978781007,0.7642063208064378,0.9998368407824682 +SLC6A12|P48065,SLC6A12,P48065,ENSG00000111181,DLPFC,0.04243291583556599,0.10994911407693496,-0.025083282405802973,0.30332095961363803,0.5210898536952243 +SPP1|P10451,SPP1,P10451,ENSG00000118785,DLPFC,0.4912046816154135,0.6957173937274684,0.2866919695033585,6.70612867459397e-08,9.97536640345853e-07 diff --git a/tests/test_assets/proteomics/input/proteomics_tmt_good_input.csv b/tests/test_assets/proteomics/input/proteomics_tmt_good_input.csv new file mode 100644 index 00000000..efab3594 --- /dev/null +++ 
b/tests/test_assets/proteomics/input/proteomics_tmt_good_input.csv @@ -0,0 +1,6 @@ +uniqid,hgnc_symbol,uniprotid,ensembl_gene_id,tissue,log2_fc,ci_upr,ci_lwr,pval,cor_pval +AKAP5|P24588,AKAP5,P24588,ENSG00000179841,DLPFC,-0.0509573858882245,0.0115626499396124,-0.113477421716061,0.109363318498206,0.288347297776616 +CRYZ|Q08257,CRYZ,Q08257,ENSG00000116791,DLPFC,0.0867240045720519,0.186673804841658,-0.0132257956975545,0.0884904979890596,0.253483015194782 +TRMT1|Q9NXH9,TRMT1,Q9NXH9,ENSG00000104907,DLPFC,0.0338755789687381,0.101380502490424,-0.0336293445529475,0.32282128810998,0.550613717538814 +HIKESHI|Q53FT3,HIKESHI,Q53FT3,ENSG00000149196,DLPFC,0.0235456798211915,0.0578409455829091,-0.0107495859405262,0.176890998787692,0.388261871125487 +ATP8A2|Q9NTI2,ATP8A2,Q9NTI2,ENSG00000132932,DLPFC,0.0376416492318968,0.0625255578627375,0.0127577406010561,0.0032829559268753,0.0311245402228597 diff --git a/tests/test_assets/proteomics/output/proteomics_lfq_good_output.json b/tests/test_assets/proteomics/output/proteomics_lfq_good_output.json new file mode 100644 index 00000000..30bd4fe1 --- /dev/null +++ b/tests/test_assets/proteomics/output/proteomics_lfq_good_output.json @@ -0,0 +1,50 @@ +[ + { + "uniqid": "ACAT1|P24752", + "genename": "ACAT1", + "uniprotid": "P24752", + "ensg": "ENSG00000075239", + "tissue": "DLPFC", + "log2_fc": -0.043497787, + "ci_upr": -0.001436119, + "ci_lwr": -0.085559454, + "pval": 0.039605844, + "cor_pval": 0.121519784 + }, + { + "uniqid": "MOGS|Q13724-2", + "genename": "MOGS", + "uniprotid": "Q13724-2", + "ensg": "ENSG00000115275", + "tissue": "MFG", + "log2_fc": -0.024737651, + "ci_upr": 0.268741708, + "ci_lwr": -0.318217011, + "pval": 0.977232785, + "cor_pval": 0.999999988 + }, + { + "uniqid": "CPLX1|O14810", + "genename": "CPLX1", + "uniprotid": "O14810", + "ensg": "ENSG00000168993", + "tissue": "TCX", + "log2_fc": -0.095310355, + "ci_upr": 0.051454069, + "ci_lwr": -0.242074778, + "pval": 0.277450316, + "cor_pval": 0.793276833 + }, + { + "uniqid": 
"MYH14|Q7Z406", + "genename": "MYH14", + "uniprotid": "Q7Z406", + "ensg": "ENSG00000105357", + "tissue": "AntPFC", + "log2_fc": 0.036596903, + "ci_upr": 0.187572953, + "ci_lwr": -0.114379147, + "pval": 0.835493315, + "cor_pval": 0.999999904 + } +] \ No newline at end of file diff --git a/tests/test_assets/proteomics/output/proteomics_lfq_missing_output.json b/tests/test_assets/proteomics/output/proteomics_lfq_missing_output.json new file mode 100644 index 00000000..31424804 --- /dev/null +++ b/tests/test_assets/proteomics/output/proteomics_lfq_missing_output.json @@ -0,0 +1,26 @@ +[ + { + "uniqid": "ACAT1|P24752", + "genename": "ACAT1", + "uniprotid": "P24752", + "ensg": "ENSG00000075239", + "tissue": "DLPFC", + "log2_fc": -0.043497787, + "ci_upr": -0.001436119, + "ci_lwr": -0.085559454, + "pval": 0.039605844, + "cor_pval": 0.121519784 + }, + { + "uniqid": "FKBP8|Q14318", + "genename": "FKBP8", + "uniprotid": "Q14318", + "ensg": "ENSG00000105701", + "tissue": "DLPFC", + "log2_fc": -0.037274408, + "ci_upr": 0.048093291, + "ci_lwr": -0.122642107, + "pval": 0.67121033, + "cor_pval": 1 + } +] \ No newline at end of file diff --git a/tests/test_assets/proteomics/output/proteomics_srm_good_output.json b/tests/test_assets/proteomics/output/proteomics_srm_good_output.json new file mode 100644 index 00000000..5dcf6fc1 --- /dev/null +++ b/tests/test_assets/proteomics/output/proteomics_srm_good_output.json @@ -0,0 +1,62 @@ +[ + { + "uniqid": "UQCR10|Q9UDW1", + "hgnc_symbol": "UQCR10", + "uniprotid": "Q9UDW1", + "ensembl_gene_id": "ENSG00000184076", + "tissue": "DLPFC", + "log2_fc": -0.047856325401006046, + "ci_upr": -0.006795632637535513, + "ci_lwr": -0.08891701816447659, + "pval": 0.017404524464477378, + "cor_pval": 0.050609778954836496 + }, + { + "uniqid": "RUVBL1|Q9Y265", + "hgnc_symbol": "RUVBL1", + "uniprotid": "Q9Y265", + "ensembl_gene_id": "ENSG00000175792", + "tissue": "DLPFC", + "log2_fc": 0.05453315389554793, + "ci_upr": 0.09014995830291843, + "ci_lwr": 
0.018916349488177428, + "pval": 0.0009952823691273815, + "cor_pval": 0.0049961119358832295 + }, + { + "uniqid": "CASS4|Q9NQ75", + "hgnc_symbol": "CASS4", + "uniprotid": "Q9NQ75", + "ensembl_gene_id": "ENSG00000087589", + "tissue": "DLPFC", + "log2_fc": -0.029649429337509504, + "ci_upr": 0.06993115111279105, + "ci_lwr": -0.12923000978781007, + "pval": 0.7642063208064378, + "cor_pval": 0.9998368407824682 + }, + { + "uniqid": "SLC6A12|P48065", + "hgnc_symbol": "SLC6A12", + "uniprotid": "P48065", + "ensembl_gene_id": "ENSG00000111181", + "tissue": "DLPFC", + "log2_fc": 0.04243291583556599, + "ci_upr": 0.10994911407693496, + "ci_lwr": -0.025083282405802973, + "pval": 0.30332095961363803, + "cor_pval": 0.5210898536952243 + }, + { + "uniqid": "SPP1|P10451", + "hgnc_symbol": "SPP1", + "uniprotid": "P10451", + "ensembl_gene_id": "ENSG00000118785", + "tissue": "DLPFC", + "log2_fc": 0.4912046816154135, + "ci_upr": 0.6957173937274684, + "ci_lwr": 0.2866919695033585, + "pval": 6.70612867459397e-08, + "cor_pval": 9.97536640345853e-07 + } +] \ No newline at end of file diff --git a/tests/test_assets/proteomics/output/proteomics_tmt_good_output.json b/tests/test_assets/proteomics/output/proteomics_tmt_good_output.json new file mode 100644 index 00000000..9862f30a --- /dev/null +++ b/tests/test_assets/proteomics/output/proteomics_tmt_good_output.json @@ -0,0 +1,62 @@ +[ + { + "uniqid": "AKAP5|P24588", + "hgnc_symbol": "AKAP5", + "uniprotid": "P24588", + "ensembl_gene_id": "ENSG00000179841", + "tissue": "DLPFC", + "log2_fc": -0.0509573858882245, + "ci_upr": 0.0115626499396124, + "ci_lwr": -0.113477421716061, + "pval": 0.109363318498206, + "cor_pval": 0.288347297776616 + }, + { + "uniqid": "CRYZ|Q08257", + "hgnc_symbol": "CRYZ", + "uniprotid": "Q08257", + "ensembl_gene_id": "ENSG00000116791", + "tissue": "DLPFC", + "log2_fc": 0.0867240045720519, + "ci_upr": 0.186673804841658, + "ci_lwr": -0.0132257956975545, + "pval": 0.0884904979890596, + "cor_pval": 0.253483015194782 + }, + { + 
"uniqid": "TRMT1|Q9NXH9", + "hgnc_symbol": "TRMT1", + "uniprotid": "Q9NXH9", + "ensembl_gene_id": "ENSG00000104907", + "tissue": "DLPFC", + "log2_fc": 0.0338755789687381, + "ci_upr": 0.101380502490424, + "ci_lwr": -0.0336293445529475, + "pval": 0.32282128810998, + "cor_pval": 0.550613717538814 + }, + { + "uniqid": "HIKESHI|Q53FT3", + "hgnc_symbol": "HIKESHI", + "uniprotid": "Q53FT3", + "ensembl_gene_id": "ENSG00000149196", + "tissue": "DLPFC", + "log2_fc": 0.0235456798211915, + "ci_upr": 0.0578409455829091, + "ci_lwr": -0.0107495859405262, + "pval": 0.176890998787692, + "cor_pval": 0.388261871125487 + }, + { + "uniqid": "ATP8A2|Q9NTI2", + "hgnc_symbol": "ATP8A2", + "uniprotid": "Q9NTI2", + "ensembl_gene_id": "ENSG00000132932", + "tissue": "DLPFC", + "log2_fc": 0.0376416492318968, + "ci_upr": 0.0625255578627375, + "ci_lwr": 0.0127577406010561, + "pval": 0.0032829559268753, + "cor_pval": 0.0311245402228597 + } +] \ No newline at end of file diff --git a/tests/test_assets/proteomics_distribution_data/input/test_proteomics_distribution_lfq_good_input.csv b/tests/test_assets/proteomics_distribution_data/input/test_proteomics_distribution_lfq_good_input.csv index 0f257ccb..08c39f08 100644 --- a/tests/test_assets/proteomics_distribution_data/input/test_proteomics_distribution_lfq_good_input.csv +++ b/tests/test_assets/proteomics_distribution_data/input/test_proteomics_distribution_lfq_good_input.csv @@ -15,4 +15,5 @@ CPLX1|O14810,CPLX1,O14810,ENSG00000168993,TCX,-0.095310355,0.051454069,-0.242074 KPNA1|P52294,KPNA1,P52294,ENSG00000114030,AntPFC,0.023613726,0.194829171,-0.147601719,0.943425986,0.999999904 CRYZ|Q08257,CRYZ,Q08257,ENSG00000116791,AntPFC,-0.045658922,0.092475201,-0.183793045,0.716192618,0.999999904 CPSF6|Q16630-3,CPSF6,Q16630-3,ENSG00000111605,AntPFC,-0.08756701,0.162699641,-0.337833662,0.688026768,0.999999904 -MYH14|Q7Z406,MYH14,Q7Z406,ENSG00000105357,AntPFC,0.036596903,0.187572953,-0.114379147,0.835493315,0.999999904 \ No newline at end of file 
+MYH14|Q7Z406,MYH14,Q7Z406,ENSG00000105357,AntPFC,0.036596903,0.187572953,-0.114379147,0.835493315,0.999999904 +CON__P35908,CON__P35908,P35908,ENSG00000172867,DLPFC,-0.096819,0.16057,-0.354207,0.764535,1.0 diff --git a/tests/test_process.py b/tests/test_process.py index 07a3bc50..7dd389d6 100644 --- a/tests/test_process.py +++ b/tests/test_process.py @@ -66,7 +66,17 @@ class TestProcessDataset: } } - def setup_method(self, syn): + dataset_object_gx_disabled = { + "neuropath_corr": { + "files": [{"name": "test_file_1", "id": "syn1111111", "format": "csv"}], + "final_format": "json", + "provenance": ["syn1111111"], + "destination": "syn1111113", + "gx_enabled": False, + } + } + + def setup_method(self): self.patch_get_entity_as_df = patch.object( extract, "get_entity_as_df", return_value=pd.DataFrame ).start() @@ -114,7 +124,7 @@ def teardown_method(self): self.patch_format_link.stop() mock.patch.stopall() - def test_process_dataset_upload_false_gx_disabled(self, syn: Any): + def test_process_dataset_upload_false_gx_not_specified(self, syn: Any): process.process_dataset( dataset_obj=self.dataset_object, staging_path=STAGING_PATH, @@ -142,7 +152,9 @@ def test_process_dataset_upload_false_gx_disabled(self, syn: Any): self.patch_format_link.assert_not_called() self.patch_load.assert_not_called() - def test_process_dataset_upload_false_gx_disabled_column_rename(self, syn: Any): + def test_process_dataset_upload_false_gx_not_specified_column_rename( + self, syn: Any + ): process.process_dataset( dataset_obj=self.dataset_object_col_rename, staging_path=STAGING_PATH, @@ -172,7 +184,7 @@ def test_process_dataset_upload_false_gx_disabled_column_rename(self, syn: Any): self.patch_format_link.assert_not_called() self.patch_load.assert_not_called() - def test_process_dataset_upload_false_gx_disabled_custom_transformations( + def test_process_dataset_upload_false_gx_not_specified_custom_transformations( self, syn: Any ): process.process_dataset( @@ -214,7 +226,9 @@ def 
test_process_dataset_upload_false_gx_disabled_custom_transformations( # This test looks like a duplicate of test_process_dataset_upload_false_gx_disabled # but it uses the agora_rename configuration with the same util function - def test_process_dataset_upload_false_gx_disabled_with_agora_rename(self, syn: Any): + def test_process_dataset_upload_false_gx_not_specified_with_agora_rename( + self, syn: Any + ): process.process_dataset( dataset_obj=self.dataset_object_col_rename, staging_path=STAGING_PATH, @@ -244,7 +258,7 @@ def test_process_dataset_upload_false_gx_disabled_with_agora_rename(self, syn: A self.patch_format_link.assert_not_called() self.patch_load.assert_not_called() - def test_process_dataset_upload_false_gx_disabled_type_dict(self, syn: Any): + def test_process_dataset_upload_false_gx_not_specified_type_dict(self, syn: Any): self.patch_standardize_values.return_value = dict() process.process_dataset( dataset_obj=self.dataset_object, @@ -275,7 +289,7 @@ def test_process_dataset_upload_false_gx_disabled_type_dict(self, syn: Any): def test_process_dataset_upload_true_gx_disabled(self, syn: Any): process.process_dataset( - dataset_obj=self.dataset_object, + dataset_obj=self.dataset_object_gx_disabled, staging_path=STAGING_PATH, gx_folder=GX_FOLDER, syn=syn, diff --git a/tests/transform/test_proteomics.py b/tests/transform/test_proteomics.py new file mode 100644 index 00000000..1302257d --- /dev/null +++ b/tests/transform/test_proteomics.py @@ -0,0 +1,82 @@ +"""Integration test for the proteomics LFQ transform. +The transform should successfully filter out proteins that start with "CON__" and should remove +rows that are missing a uniqid value. The only failure case for this transform is when "uniqid" +is not a column in the data frame. +""" + +import os + +import pandas as pd +import pytest + +from agoradatatools.etl.transform import proteomics + + +class TestTranformProteomics: + """Class for testing the transform. 
+ ADT currently ingests three proteomics data sets (LFQ, TMT, SRM) and runs the transform on each. + Currently only LFQ data is actually modified by the transform, so the TMT and SRM test data + should not be changed by the transform. + """ + + data_files_path = "tests/test_assets/proteomics" + pass_test_data = [ + ( # pass with good data (LFQ) + "proteomics_lfq_good_input.csv", + "proteomics_lfq_good_output.json", + ), + ( # pass with missing data (LFQ) + "proteomics_lfq_missing_input.csv", + "proteomics_lfq_missing_output.json", + ), + ( # pass with good data (TMT) + "proteomics_tmt_good_input.csv", + "proteomics_tmt_good_output.json", + ), + ( # pass with good data (SRM) + "proteomics_srm_good_input.csv", + "proteomics_srm_good_output.json", + ), + ] + pass_test_ids = [ + "Pass with good data (LFQ)", + "Pass with missing data (LFQ)", + "Pass with good data (TMT)", + "Pass with good data (SRM)", + ] + fail_test_data = [ + "proteomics_lfq_no_uniqid_input.csv", + ] + fail_test_ids = [ + "Fail with missing uniqid column", + ] + + @pytest.mark.parametrize( + "input_file, expected_output_file", pass_test_data, ids=pass_test_ids + ) + def test_transform_proteomics_should_pass( + self, input_file: str, expected_output_file: str + ) -> None: + """Passing conditions: "CON__" proteins and proteins with NA uniqids are removed, all + other rows are left intact. + """ + input_df = pd.read_csv(os.path.join(self.data_files_path, "input", input_file)) + + # reset_index is necessary because the index values need to match the expected output, but + # if rows are removed from the output, the index values will differ. 
+ output_df = proteomics.transform_proteomics(df=input_df).reset_index(drop=True) + expected_df = pd.read_json( + os.path.join(self.data_files_path, "output", expected_output_file), + ) + pd.testing.assert_frame_equal(output_df, expected_df) + + @pytest.mark.parametrize("input_file", fail_test_data, ids=fail_test_ids) + def test_transform_proteomics_should_fail(self, input_file: str) -> None: + """Failure condition: "uniqid" is not a column in the data frame, which raises a KeyError. + The input is read outside pytest.raises so only the transform call can satisfy the check. + """ + input_df = pd.read_csv( + os.path.join(self.data_files_path, "input", input_file) + ) + with pytest.raises(KeyError): + proteomics.transform_proteomics(df=input_df) diff --git a/tests/transform/test_proteomics_distribution_data.py b/tests/transform/test_proteomics_distribution_data.py index 5281e845..941c1d04 100644 --- a/tests/transform/test_proteomics_distribution_data.py +++ b/tests/transform/test_proteomics_distribution_data.py @@ -7,10 +7,9 @@ # NOTE: This test's input is structured a little differently than the other transform -# tests because we may have up to 3 input files with specific dataset names but the -# test should work with the current 2 we support, and won't require modifying the -# test functions to add SRM data later. This structure also lets us test what happens -# when we input a file with an unsupported proteomics type. +# tests because we may have up to 3 input files with specific dataset names but can have fewer, as +# there used to be only 2 proteomics data sets. This structure also lets us test what happens when +# we input a file with an unsupported proteomics type. class TestTransformProteomicsDistributionData: data_files_path = "tests/test_assets/proteomics_distribution_data" pass_test_data = [