diff --git a/config.yaml b/config.yaml index 96aa0e51..19b4175b 100644 --- a/config.yaml +++ b/config.yaml @@ -100,6 +100,7 @@ datasets: genename: hgnc_symbol ensg: ensembl_gene_id destination: *dest + gx_folder: syn53469660 - proteomics_srm: files: *agora_proteomics_srm_files diff --git a/gx_suite_definitions/proteomics_tmt.ipynb b/gx_suite_definitions/proteomics_tmt.ipynb new file mode 100644 index 00000000..3b6e8a41 --- /dev/null +++ b/gx_suite_definitions/proteomics_tmt.ipynb @@ -0,0 +1,293 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import synapseclient\n", + "\n", + "import pandas as pd\n", + "import great_expectations as gx\n", + "\n", + "from agoradatatools.gx import GreatExpectationsRunner\n", + "\n", + "context = gx.get_context(project_root_dir='../src/agoradatatools/great_expectations')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Create Expectation Suite for Proteomics TMT Data" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Get Example Data File" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "syn = synapseclient.Synapse()\n", + "syn.login()\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "proteomics_tmt_file = syn.get(\"syn32210527\").path" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Create Validator Object on Data File" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df = pd.read_json(proteomics_tmt_file)\n", + "nested_columns = []\n", + "df = GreatExpectationsRunner.convert_nested_columns_to_json(df, nested_columns)\n", + "validator = context.sources.pandas_default.read_dataframe(df)\n", + "validator.expectation_suite_name = \"proteomics_tmt\"" + ] + }, + { + "cell_type": 
"markdown", + "metadata": {}, + "source": [ + "## Add Expectations to Validator Object For Each Column" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# uniqid\n", + "validator.expect_column_values_to_be_of_type(\"uniqid\", \"str\")\n", + "validator.expect_column_value_lengths_to_be_between(\"uniqid\", 1, 25)\n", + "validator.expect_column_values_to_match_regex(\"uniqid\", \"^[a-zA-Z0-9.]+?|[a-zA-Z0-9-]+$\")\n", + "validator.expect_column_values_to_be_unique(\"uniqid\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# hgnc_symbol\n", + "validator.expect_column_values_to_be_of_type(\"hgnc_symbol\", \"str\")\n", + "validator.expect_column_value_lengths_to_be_between(\"hgnc_symbol\", 1, 15)\n", + "validator.expect_column_values_to_match_regex(\"hgnc_symbol\", \"^[a-zA-Z0-9.-]*$\")\n", + "validator.expect_column_values_to_not_be_null(\"hgnc_symbol\")\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# uniprotid\n", + "validator.expect_column_values_to_be_of_type(\"uniprotid\", \"str\")\n", + "validator.expect_column_value_lengths_to_be_between(\"uniprotid\", 1, 15)\n", + "validator.expect_column_values_to_match_regex(\"uniprotid\", \"^[a-zA-Z0-9.-]*$\")\n", + "validator.expect_column_values_to_be_unique(\"uniprotid\")\n", + "validator.expect_column_values_to_not_be_null(\"uniprotid\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# ensembl_gene_id\n", + "validator.expect_column_values_to_be_of_type(\"ensembl_gene_id\", \"str\")\n", + "validator.expect_column_values_to_not_be_null(\"ensembl_gene_id\")\n", + "validator.expect_column_value_lengths_to_equal(\"ensembl_gene_id\", 15)\n", + "# checks format and allowed characters\n", + "validator.expect_column_values_to_match_regex(\"ensembl_gene_id\",
\"^ENSG\\d{11}$\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# tissue\n", + "validator.expect_column_values_to_be_of_type(\"tissue\", \"str\")\n", + "validator.expect_column_value_lengths_to_be_between(\"tissue\", 1, 15)\n", + "validator.expect_column_values_to_be_in_set(\"tissue\", [\"AntPFC\", \"DLPFC\", \"MFG\", \"TCX\"])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# log2_fc\n", + "validator.expect_column_values_to_be_of_type(\"log2_fc\", \"float\")\n", + "validator.expect_column_values_to_be_between(\"log2_fc\", -0.5, 1.5)\n", + "validator.expect_column_values_to_not_be_null(\"log2_fc\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# ci_upr\n", + "validator.expect_column_values_to_be_of_type(\"ci_upr\", \"float\")\n", + "validator.expect_column_values_to_be_between(\"ci_upr\", -0.5, 2)\n", + "validator.expect_column_values_to_not_be_null(\"ci_upr\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# ci_lwr\n", + "validator.expect_column_values_to_be_of_type(\"ci_lwr\", \"float\")\n", + "validator.expect_column_values_to_be_between(\"ci_lwr\", -1, 1.5)\n", + "validator.expect_column_values_to_not_be_null(\"ci_lwr\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# pval\n", + "validator.expect_column_values_to_be_of_type(\"pval\", \"float\")\n", + "validator.expect_column_values_to_be_between(\"pval\", 0, 1)\n", + "validator.expect_column_values_to_not_be_null(\"pval\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# cor_pval\n", + "validator.expect_column_values_to_be_of_type(\"cor_pval\", \"float\")\n", + 
"validator.expect_column_values_to_be_between(\"cor_pval\", 0, 1)\n", + "validator.expect_column_values_to_not_be_null(\"cor_pval\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# multi-field\n", + "validator.expect_column_pair_values_a_to_be_greater_than_b(\"ci_upr\", \"ci_lwr\")\n", + "validator.expect_compound_columns_to_be_unique([\"uniqid\", \"tissue\"])\n", + "validator.expect_compound_columns_to_be_unique([\"hgnc_symbol\", \"uniprotid\", \"tissue\"])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Save Expectation Suite" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "validator.save_expectation_suite(discard_failed_expectations=False)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Create Checkpoint and View Results" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "checkpoint = context.add_or_update_checkpoint(\n", + " name=\"agora-test-checkpoint\",\n", + " validator=validator,\n", + ")\n", + "checkpoint_result = checkpoint.run()\n", + "context.view_validation_result(checkpoint_result)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Build Data Docs - Click on Expectation Suite to View All Expectations" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "context.build_data_docs()\n", + "context.open_data_docs()\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "agora-data-tools-CK0oUlHB", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.13" 
+ } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/src/agoradatatools/great_expectations/gx/expectations/proteomics_tmt.json b/src/agoradatatools/great_expectations/gx/expectations/proteomics_tmt.json new file mode 100644 index 00000000..164e3b82 --- /dev/null +++ b/src/agoradatatools/great_expectations/gx/expectations/proteomics_tmt.json @@ -0,0 +1,323 @@ +{ + "data_asset_type": null, + "expectation_suite_name": "proteomics_tmt", + "expectations": [ + { + "expectation_type": "expect_column_values_to_be_of_type", + "kwargs": { + "column": "uniqid", + "type_": "str" + }, + "meta": {} + }, + { + "expectation_type": "expect_column_value_lengths_to_be_between", + "kwargs": { + "column": "uniqid", + "max_value": 25, + "min_value": 1 + }, + "meta": {} + }, + { + "expectation_type": "expect_column_values_to_match_regex", + "kwargs": { + "column": "uniqid", + "regex": "^[a-zA-Z0-9.]+?|[a-zA-Z0-9-]+$" + }, + "meta": {} + }, + { + "expectation_type": "expect_column_values_to_be_unique", + "kwargs": { + "column": "uniqid" + }, + "meta": {} + }, + { + "expectation_type": "expect_column_values_to_be_of_type", + "kwargs": { + "column": "hgnc_symbol", + "type_": "str" + }, + "meta": {} + }, + { + "expectation_type": "expect_column_value_lengths_to_be_between", + "kwargs": { + "column": "hgnc_symbol", + "max_value": 15, + "min_value": 1 + }, + "meta": {} + }, + { + "expectation_type": "expect_column_values_to_match_regex", + "kwargs": { + "column": "hgnc_symbol", + "regex": "^[a-zA-Z0-9.-]*$" + }, + "meta": {} + }, + { + "expectation_type": "expect_column_values_to_not_be_null", + "kwargs": { + "column": "hgnc_symbol" + }, + "meta": {} + }, + { + "expectation_type": "expect_column_values_to_be_of_type", + "kwargs": { + "column": "uniprotid", + "type_": "str" + }, + "meta": {} + }, + { + "expectation_type": "expect_column_value_lengths_to_be_between", + "kwargs": { + "column": "uniprotid", + "max_value": 15, + "min_value": 1 + }, + "meta": {} + }, + { + 
"expectation_type": "expect_column_values_to_match_regex", + "kwargs": { + "column": "uniprotid", + "regex": "^[a-zA-Z0-9.-]*$" + }, + "meta": {} + }, + { + "expectation_type": "expect_column_values_to_be_unique", + "kwargs": { + "column": "uniprotid" + }, + "meta": {} + }, + { + "expectation_type": "expect_column_values_to_not_be_null", + "kwargs": { + "column": "uniprotid" + }, + "meta": {} + }, + { + "expectation_type": "expect_column_values_to_be_of_type", + "kwargs": { + "column": "ensembl_gene_id", + "type_": "str" + }, + "meta": {} + }, + { + "expectation_type": "expect_column_values_to_not_be_null", + "kwargs": { + "column": "ensembl_gene_id" + }, + "meta": {} + }, + { + "expectation_type": "expect_column_value_lengths_to_equal", + "kwargs": { + "column": "ensembl_gene_id", + "value": 15 + }, + "meta": {} + }, + { + "expectation_type": "expect_column_values_to_match_regex", + "kwargs": { + "column": "ensembl_gene_id", + "regex": "^ENSG\\d{11}$" + }, + "meta": {} + }, + { + "expectation_type": "expect_column_values_to_be_of_type", + "kwargs": { + "column": "tissue", + "type_": "str" + }, + "meta": {} + }, + { + "expectation_type": "expect_column_value_lengths_to_be_between", + "kwargs": { + "column": "tissue", + "max_value": 15, + "min_value": 1 + }, + "meta": {} + }, + { + "expectation_type": "expect_column_values_to_be_in_set", + "kwargs": { + "column": "tissue", + "value_set": [ + "AntPFC", + "DLPFC", + "MFG", + "TCX" + ] + }, + "meta": {} + }, + { + "expectation_type": "expect_column_values_to_be_of_type", + "kwargs": { + "column": "log2_fc", + "type_": "float" + }, + "meta": {} + }, + { + "expectation_type": "expect_column_values_to_be_between", + "kwargs": { + "column": "log2_fc", + "max_value": 1.5, + "min_value": -0.5 + }, + "meta": {} + }, + { + "expectation_type": "expect_column_values_to_not_be_null", + "kwargs": { + "column": "log2_fc" + }, + "meta": {} + }, + { + "expectation_type": "expect_column_values_to_be_of_type", + "kwargs": { + "column": 
"ci_upr", + "type_": "float" + }, + "meta": {} + }, + { + "expectation_type": "expect_column_values_to_be_between", + "kwargs": { + "column": "ci_upr", + "max_value": 2, + "min_value": -0.5 + }, + "meta": {} + }, + { + "expectation_type": "expect_column_values_to_not_be_null", + "kwargs": { + "column": "ci_upr" + }, + "meta": {} + }, + { + "expectation_type": "expect_column_values_to_be_of_type", + "kwargs": { + "column": "ci_lwr", + "type_": "float" + }, + "meta": {} + }, + { + "expectation_type": "expect_column_values_to_be_between", + "kwargs": { + "column": "ci_lwr", + "max_value": 1.5, + "min_value": -1 + }, + "meta": {} + }, + { + "expectation_type": "expect_column_values_to_not_be_null", + "kwargs": { + "column": "ci_lwr" + }, + "meta": {} + }, + { + "expectation_type": "expect_column_values_to_be_of_type", + "kwargs": { + "column": "pval", + "type_": "float" + }, + "meta": {} + }, + { + "expectation_type": "expect_column_values_to_be_between", + "kwargs": { + "column": "pval", + "max_value": 1, + "min_value": 0 + }, + "meta": {} + }, + { + "expectation_type": "expect_column_values_to_not_be_null", + "kwargs": { + "column": "pval" + }, + "meta": {} + }, + { + "expectation_type": "expect_column_values_to_be_of_type", + "kwargs": { + "column": "cor_pval", + "type_": "float" + }, + "meta": {} + }, + { + "expectation_type": "expect_column_values_to_be_between", + "kwargs": { + "column": "cor_pval", + "max_value": 1, + "min_value": 0 + }, + "meta": {} + }, + { + "expectation_type": "expect_column_values_to_not_be_null", + "kwargs": { + "column": "cor_pval" + }, + "meta": {} + }, + { + "expectation_type": "expect_column_pair_values_a_to_be_greater_than_b", + "kwargs": { + "column_A": "ci_upr", + "column_B": "ci_lwr" + }, + "meta": {} + }, + { + "expectation_type": "expect_compound_columns_to_be_unique", + "kwargs": { + "column_list": [ + "uniqid", + "tissue" + ] + }, + "meta": {} + }, + { + "expectation_type": "expect_compound_columns_to_be_unique", + "kwargs": { 
+ "column_list": [ + "hgnc_symbol", + "uniprotid", + "tissue" + ] + }, + "meta": {} + } + ], + "ge_cloud_id": null, + "meta": { + "great_expectations_version": "0.18.1" + } +} diff --git a/test_config.yaml b/test_config.yaml index a363b1ef..3991f229 100644 --- a/test_config.yaml +++ b/test_config.yaml @@ -100,6 +100,7 @@ datasets: genename: hgnc_symbol ensg: ensembl_gene_id destination: *dest + gx_folder: syn53469659 - proteomics_srm: files: *agora_proteomics_srm_files