Skip to content

Commit

Permalink
Merge branch 'dev' into bwmac/IBCDPE-794/proteomics_gx
Browse files Browse the repository at this point in the history
  • Loading branch information
BWMac committed Sep 16, 2024
2 parents ebcb04b + 7c972a3 commit 50010f2
Show file tree
Hide file tree
Showing 23 changed files with 808 additions and 21 deletions.
8 changes: 6 additions & 2 deletions config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -20,10 +20,10 @@ sources:
- agora_proteomics:
agora_proteomics_files: &agora_proteomics_files
- name: proteomics
id: syn18689335.3
id: syn18689335.4
format: csv
agora_proteomics_provenance: &agora_proteomics_provenance
- syn18689335.3
- syn18689335.4
- agora_proteomics_tmt:
agora_proteomics_tmt_files: &agora_proteomics_tmt_files
- name: proteomics_tmt
Expand Down Expand Up @@ -89,6 +89,7 @@ datasets:
- proteomics:
files: *agora_proteomics_files
final_format: json
custom_transformations: 1
provenance: *agora_proteomics_provenance
column_rename:
genename: hgnc_symbol
Expand All @@ -99,6 +100,7 @@ datasets:
- proteomics_tmt:
files: *agora_proteomics_tmt_files
final_format: json
custom_transformations: 1
provenance: *agora_proteomics_tmt_provenance
column_rename:
genename: hgnc_symbol
Expand All @@ -109,6 +111,7 @@ datasets:
- proteomics_srm:
files: *agora_proteomics_srm_files
final_format: json
custom_transformations: 1
provenance: *agora_proteomics_srm_provenance
column_rename:
genename: hgnc_symbol
Expand Down Expand Up @@ -290,6 +293,7 @@ datasets:
custom_transformations: 1
provenance: *rna_diff_expr_data_provenance
destination: *dest
gx_enabled: true

- proteomics_distribution_data:
files:
Expand Down
243 changes: 243 additions & 0 deletions gx_suite_definitions/rna_distributinon_data.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,243 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import synapseclient\n",
"\n",
"import great_expectations as gx\n",
"\n",
"context = gx.get_context(project_root_dir='../src/agoradatatools/great_expectations')\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Create Expectation Suite for RNA Distribution Data"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Get Example Data File"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"syn = synapseclient.Synapse()\n",
"syn.login()\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"rna_distribution_data_file = syn.get(\"syn28094691\").path\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Create Validator Object on Data File"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"validator = context.sources.pandas_default.read_json(\n",
" rna_distribution_data_file\n",
")\n",
"validator.expectation_suite_name = \"rna_distribution_data\"\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Add Expectations to Validator Object For Each Column"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# model\n",
"validator.expect_column_values_to_be_of_type(\"model\", \"str\")\n",
"validator.expect_column_values_to_not_be_null(\"model\")\n",
"validator.expect_column_values_to_be_in_set(\"model\", [\"AD Diagnosis (males and females)\", \"AD Diagnosis x AOD (males and females)\",\"AD Diagnosis x Sex (females only)\", \"AD Diagnosis x Sex (males only)\"])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# tissue\n",
"validator.expect_column_values_to_be_of_type(\"tissue\", \"str\")\n",
"validator.expect_column_values_to_not_be_null(\"tissue\")\n",
"validator.expect_column_values_to_be_in_set(\"tissue\", [\"CBE\", \"DLPFC\", \"FP\", \"IFG\", \"PHG\", \"STG\", \"TCX\", \"ACC\", \"PCC\"])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# min\n",
"validator.expect_column_values_to_be_of_type(\"min\", \"float\")\n",
"validator.expect_column_values_to_not_be_null(\"min\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# max\n",
"validator.expect_column_values_to_be_of_type(\"max\", \"float\")\n",
"validator.expect_column_values_to_not_be_null(\"max\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# median\n",
"validator.expect_column_values_to_be_of_type(\"median\", \"float\")\n",
"validator.expect_column_values_to_not_be_null(\"median\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# first_quartile\n",
"validator.expect_column_values_to_be_of_type(\"first_quartile\", \"float\")\n",
"validator.expect_column_values_to_not_be_null(\"first_quartile\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# third_quartile\n",
"validator.expect_column_values_to_be_of_type(\"third_quartile\", \"float\")\n",
"validator.expect_column_values_to_not_be_null(\"third_quartile\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# multi-field\n",
"validator.expect_column_pair_values_a_to_be_greater_than_b(\"max\", \"third_quartile\")\n",
"validator.expect_column_pair_values_a_to_be_greater_than_b(\"third_quartile\", \"median\")\n",
"validator.expect_column_pair_values_a_to_be_greater_than_b(\"median\", \"first_quartile\")\n",
"validator.expect_column_pair_values_a_to_be_greater_than_b(\"first_quartile\", \"min\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Save Expectation Suite"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"validator.save_expectation_suite(discard_failed_expectations=False)\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Create Checkpoint and View Results"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"checkpoint = context.add_or_update_checkpoint(\n",
" name=\"agora-test-checkpoint\",\n",
" validator=validator,\n",
")\n",
"checkpoint_result = checkpoint.run()\n",
"context.view_validation_result(checkpoint_result)\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Build Data Docs - Click on Expectation Suite to View All Expectations"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"context.build_data_docs()\n",
"context.open_data_docs()\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.13"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
2 changes: 2 additions & 0 deletions src/agoradatatools/etl/transform/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
transform_rnaseq_differential_expression,
)
from agoradatatools.etl.transform.team_info import transform_team_info
from agoradatatools.etl.transform.proteomics import transform_proteomics

__all__ = [
"transform_distribution_data",
Expand All @@ -26,4 +27,5 @@
"transform_rna_distribution_data",
"transform_rnaseq_differential_expression",
"transform_team_info",
"transform_proteomics",
]
7 changes: 4 additions & 3 deletions src/agoradatatools/etl/transform/gene_info.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import pandas as pd

from agoradatatools.etl.utils import nest_fields
from agoradatatools.etl import transform


def transform_gene_info(
Expand All @@ -14,10 +15,10 @@ def transform_gene_info(
gene_metadata = datasets["gene_metadata"]
igap = datasets["igap"]
eqtl = datasets["eqtl"]
proteomics = datasets["proteomics"]
proteomics = transform.transform_proteomics(df=datasets["proteomics"])
rna_change = datasets["diff_exp_data"]
proteomics_tmt = datasets["proteomics_tmt"]
proteomics_srm = datasets["proteomics_srm"]
proteomics_tmt = transform.transform_proteomics(df=datasets["proteomics_tmt"])
proteomics_srm = transform.transform_proteomics(df=datasets["proteomics_srm"])
target_list = datasets["target_list"]
median_expression = datasets["median_expression"]
druggability = datasets["druggability"]
Expand Down
25 changes: 25 additions & 0 deletions src/agoradatatools/etl/transform/proteomics.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
"""Function for transforming proteomics data. This function is called on all three proteomics
data sets, although currently it only affects the LFQ data set as it is the only one with "CON__"
entries.
"""

import pandas as pd


def transform_proteomics(df: pd.DataFrame) -> pd.DataFrame:
    """Removes contaminant rows and rows with a missing uniqid from proteomics data.

    A uniqid containing "CON__" indicates that the protein is a known contaminant and
    should be removed from the final data set. Rows with an NA uniqid are also removed.

    Args:
        df (pd.DataFrame): pandas DataFrame containing proteomics data. Must contain a
            column called "uniqid".

    Returns:
        pd.DataFrame: a new DataFrame containing only the rows of the input whose uniqid
            is present and does not contain "CON__". The input DataFrame is not modified.
    """
    # Using "na=True" causes rows with NA uniqids to be set to True so they get removed
    remove_rows = df["uniqid"].str.contains("CON__", na=True)
    # Filter with the boolean mask instead of dropping by index label: dropping labels
    # would also discard valid rows that share an index label with a removed row when
    # the index is not unique. The explicit copy keeps the result independent of the
    # input, matching what DataFrame.drop returned.
    return df.loc[~remove_rows].copy()
5 changes: 4 additions & 1 deletion src/agoradatatools/etl/transform/proteomics_distribution.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import pandas as pd

from agoradatatools.etl import utils
from agoradatatools.etl import utils, transform


def transform_proteomics_distribution_data(datasets: dict) -> pd.DataFrame:
Expand All @@ -18,6 +18,9 @@ def transform_proteomics_distribution_data(datasets: dict) -> pd.DataFrame:
"""
transformed = []
for name, dataset in datasets.items():
# Remove contaminant ("CON__") entries and rows with NA uniqids before calculating distribution
dataset = transform.transform_proteomics(df=dataset)

df = utils.calculate_distribution(
df=dataset, grouping="tissue", distribution_column="log2_fc"
)
Expand Down
Loading

0 comments on commit 50010f2

Please sign in to comment.