Skip to content

Commit

Permalink
Merge pull request #79 from Sage-Bionetworks/jbeck/AG-1157/create_bio…
Browse files Browse the repository at this point in the history
…domains_info_transform

Added transform for biodomain_info
  • Loading branch information
jaclynbeck-sage authored Jun 16, 2023
2 parents 5b7f26a + fe54746 commit 9868ff2
Show file tree
Hide file tree
Showing 10 changed files with 151 additions and 0 deletions.
13 changes: 13 additions & 0 deletions config.yaml
Original file line number Diff line number Diff line change
@@ -1,6 +1,19 @@
- destination: &dest syn12177492
- staging_path: ./staging
- datasets:
- biodomain_info:
files:
- name: genes_biodomains
id: syn44151254.1
format: csv
final_format: json
custom_transformations: 1
provenance:
- syn44151254.1
column_rename:
biodomain: name
destination: *dest

- genes_biodomains:
files:
- name: genes_biodomains
Expand Down
4 changes: 4 additions & 0 deletions src/agoradatatools/etl/transform/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,9 @@
transform_distribution_data,
)
from agoradatatools.etl.transform.gene_info import transform_gene_info
from agoradatatools.etl.transform.biodomain_info import (
transform_biodomain_info,
)
from agoradatatools.etl.transform.genes_biodomains import (
transform_genes_biodomains,
)
Expand All @@ -22,6 +25,7 @@
__all__ = [
"transform_distribution_data",
"transform_gene_info",
"transform_biodomain_info",
"transform_genes_biodomains",
"transform_overall_scores",
"create_proteomics_distribution_data",
Expand Down
25 changes: 25 additions & 0 deletions src/agoradatatools/etl/transform/biodomain_info.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
import pandas as pd


def transform_biodomain_info(datasets: dict) -> pd.DataFrame:
    """Build the sorted list of unique biodomain names.

    Takes the "genes_biodomains" DataFrame out of ``datasets``, reads its
    "name" column, discards missing values and duplicates, and returns the
    remaining names in ascending order as a single-column DataFrame.

    Args:
        datasets (dict[str, pd.DataFrame]): dictionary of dataset names mapped
            to their DataFrame. Must contain the key "genes_biodomains", and
            that DataFrame must have a "name" column.

    Returns:
        pd.DataFrame: 1-column DataFrame with column "name" and a fresh
            0..n-1 index.

    Raises:
        KeyError: if "genes_biodomains" is missing from ``datasets`` or the
            DataFrame has no "name" column.
    """
    biodomain_names = datasets["genes_biodomains"]["name"]

    # Sort the Series itself and let sort_values rebuild the index, so the
    # result never depends on how the input index is labelled. (Materializing
    # the index with reset_index() and dropping it by the label "index" only
    # works when the input index is unnamed.)
    return (
        biodomain_names.dropna()
        .drop_duplicates()
        .sort_values(ignore_index=True)
        .to_frame(name="name")
    )
2 changes: 2 additions & 0 deletions src/agoradatatools/process.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@
def apply_custom_transformations(datasets: dict, dataset_name: str, dataset_obj: dict):
if not isinstance(datasets, dict) or not isinstance(dataset_name, str):
return None
if dataset_name == "biodomain_info":
return transform.transform_biodomain_info(datasets=datasets)
if dataset_name == "genes_biodomains":
return transform.transform_genes_biodomains(datasets=datasets)
if dataset_name == "overall_scores":
Expand Down
13 changes: 13 additions & 0 deletions test_config.yaml
Original file line number Diff line number Diff line change
@@ -1,6 +1,19 @@
- destination: &dest syn17015333
- staging_path: ./staging
- datasets:
- biodomain_info:
files:
- name: genes_biodomains
id: syn44151254.1
format: csv
final_format: json
custom_transformations: 1
provenance:
- syn44151254.1
column_rename:
biodomain: name
destination: *dest

- genes_biodomains:
files:
- name: genes_biodomains
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
name,abbr,label,color,go_id,goterm_name,ensembl_id
Proteostasis,Pr,Proteostasis [Pr],#c8b269,GO:0048199,"vesicle targeting, to, from or within Golgi",ENSG00000170348
Proteostasis,Pr,Proteostasis [Pr],#c8b269,GO:0007030,Golgi organization,ENSG00000243414
Immune Response,IR,Immune Response [IR],#9ccdcc,GO:0006955,immune response,ENSG00000115008
Immune Response,IR,Immune Response [IR],#9ccdcc,GO:0006955,immune response,ENSG00000278006
Immune Response,IR,Immune Response [IR],#9ccdcc,GO:0006955,immune response,ENSG00000275313
Tau Homeostasis,TH,Tau Homeostasis [TH],#cb97cb,GO:1902988,neurofibrillary tangle assembly,ENSG00000186868
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
name,abbr,label,color,go_id,goterm_name,ensembl_id
,Pr,Proteostasis [Pr],#c8b269,GO:0048199,"vesicle targeting, to, from or within Golgi",ENSG00000170348
Proteostasis,,Proteostasis [Pr],#c8b269,GO:0007030,Golgi organization,ENSG00000243414
Immune Response,IR,,#9ccdcc,GO:0006955,immune response,
Immune Response,IR,Immune Response [IR],,GO:0006955,immune response,ENSG00000278006
Immune Response,IR,Immune Response [IR],#9ccdcc,,immune response,ENSG00000275313
Tau Homeostasis,TH,Tau Homeostasis [TH],#cb97cb,GO:1902988,,ENSG00000186868
,Ap,Apoptosis [Ap],#673399,GO:0006915,apoptotic process,ENSG00000125538
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
[
{
"name": "Immune Response"
},
{
"name": "Proteostasis"
},
{
"name": "Tau Homeostasis"
}
]
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
[
{
"name": "Immune Response"
},
{
"name": "Proteostasis"
},
{
"name": "Tau Homeostasis"
}
]
57 changes: 57 additions & 0 deletions tests/transform/test_biodomain_info.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
import os
import pandas as pd
import pytest

from agoradatatools.etl.transform import biodomain_info


class TestTransformBiodomainInfo:
    """Fixture-driven tests for ``biodomain_info.transform_biodomain_info``."""

    # Root folder of the fixtures; inputs are read from its "input"
    # sub-folder and expected results from its "output" sub-folder.
    data_files_path = "tests/test_assets/biodomain_info"

    # (input CSV, expected output JSON) pairs the transform should handle.
    pass_test_data = [
        # Pass with good data
        ("biodomain_info_good_input.csv", "biodomain_info_good_output.json"),
        # Pass with values missing from each column
        (
            "biodomain_info_imperfect_input.csv",
            "biodomain_info_imperfect_output.json",
        ),
    ]
    pass_test_ids = [
        "Pass with good data",
        "Pass with missing values in each column",
    ]

    # No failure cases for this transform
    fail_test_data = []
    fail_test_ids = []

    @pytest.mark.parametrize(
        "biodomain_info_file, expected_output_file", pass_test_data, ids=pass_test_ids
    )
    def test_transform_biodomain_info_should_pass(
        self, biodomain_info_file, expected_output_file
    ):
        """Transform one input CSV and compare it with its expected JSON."""
        input_path = os.path.join(self.data_files_path, "input", biodomain_info_file)
        expected_path = os.path.join(
            self.data_files_path, "output", expected_output_file
        )

        # The transform looks the DataFrame up under the "genes_biodomains" key.
        actual = biodomain_info.transform_biodomain_info(
            datasets={"genes_biodomains": pd.read_csv(input_path)}
        )
        expected = pd.read_json(expected_path)

        pd.testing.assert_frame_equal(actual, expected)

    """
    # Leaving code stub for failure case, in case we want to add this in the future
    @pytest.mark.parametrize("biodomain_info_file", fail_test_data, ids=fail_test_ids)
    def test_transform_biodomain_info_should_fail(self, biodomain_info_file):
        with pytest.raises(<Error type>):
            biodomain_info_df = pd.read_csv(os.path.join(self.data_files_path, "input", biodomain_info_file))
            biodomain_info.transform_biodomain_info(
                datasets={"genes_biodomains": biodomain_info_df}
            )
    """

0 comments on commit 9868ff2

Please sign in to comment.