-
Notifications
You must be signed in to change notification settings - Fork 3
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #79 from Sage-Bionetworks/jbeck/AG-1157/create_biodomains_info_transform
Added transform for biodomain_info
- Loading branch information
Showing 10 changed files with 151 additions and 0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,25 @@ | ||
import pandas as pd | ||
|
||
|
||
def transform_biodomain_info(datasets: dict) -> pd.DataFrame:
    """Build the sorted, de-duplicated list of biodomain names.

    Takes a dictionary of dataset DataFrames, extracts the genes_biodomains
    DataFrame, gets a unique list of its non-null "name" values, and outputs
    the list as a single-column DataFrame with column "name", sorted
    ascending with a fresh 0..n-1 index.

    Args:
        datasets (dict[str, pd.DataFrame]): dictionary of dataset names mapped to their DataFrame

    Returns:
        pd.DataFrame: 1-column DataFrame with column "name"

    Raises:
        KeyError: if "genes_biodomains" is missing from ``datasets`` or the
            DataFrame has no "name" column.
    """
    genes_biodomains = datasets["genes_biodomains"]
    biodomain_info = (
        genes_biodomains["name"]
        .dropna()
        .drop_duplicates()
        # Sort while still a Series and discard the old index here, rather
        # than reset_index() + drop(columns="index"): that pattern only works
        # for an unnamed, single-level index and fails on a named index or a
        # MultiIndex.
        .sort_values(ignore_index=True)
        .to_frame(name="name")
    )

    return biodomain_info
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
7 changes: 7 additions & 0 deletions
7
tests/test_assets/biodomain_info/input/biodomain_info_good_input.csv
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,7 @@ | ||
name,abbr,label,color,go_id,goterm_name,ensembl_id | ||
Proteostasis,Pr,Proteostasis [Pr],#c8b269,GO:0048199,"vesicle targeting, to, from or within Golgi",ENSG00000170348 | ||
Proteostasis,Pr,Proteostasis [Pr],#c8b269,GO:0007030,Golgi organization,ENSG00000243414 | ||
Immune Response,IR,Immune Response [IR],#9ccdcc,GO:0006955,immune response,ENSG00000115008 | ||
Immune Response,IR,Immune Response [IR],#9ccdcc,GO:0006955,immune response,ENSG00000278006 | ||
Immune Response,IR,Immune Response [IR],#9ccdcc,GO:0006955,immune response,ENSG00000275313 | ||
Tau Homeostasis,TH,Tau Homeostasis [TH],#cb97cb,GO:1902988,neurofibrillary tangle assembly,ENSG00000186868 |
8 changes: 8 additions & 0 deletions
8
tests/test_assets/biodomain_info/input/biodomain_info_imperfect_input.csv
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,8 @@ | ||
name,abbr,label,color,go_id,goterm_name,ensembl_id | ||
,Pr,Proteostasis [Pr],#c8b269,GO:0048199,"vesicle targeting, to, from or within Golgi",ENSG00000170348 | ||
Proteostasis,,Proteostasis [Pr],#c8b269,GO:0007030,Golgi organization,ENSG00000243414 | ||
Immune Response,IR,,#9ccdcc,GO:0006955,immune response, | ||
Immune Response,IR,Immune Response [IR],,GO:0006955,immune response,ENSG00000278006 | ||
Immune Response,IR,Immune Response [IR],#9ccdcc,,immune response,ENSG00000275313 | ||
Tau Homeostasis,TH,Tau Homeostasis [TH],#cb97cb,GO:1902988,,ENSG00000186868 | ||
,Ap,Apoptosis [Ap],#673399,GO:0006915,apoptotic process,ENSG00000125538 |
11 changes: 11 additions & 0 deletions
11
tests/test_assets/biodomain_info/output/biodomain_info_good_output.json
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,11 @@ | ||
[ | ||
{ | ||
"name": "Immune Response" | ||
}, | ||
{ | ||
"name": "Proteostasis" | ||
}, | ||
{ | ||
"name": "Tau Homeostasis" | ||
} | ||
] |
11 changes: 11 additions & 0 deletions
11
tests/test_assets/biodomain_info/output/biodomain_info_imperfect_output.json
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,11 @@ | ||
[ | ||
{ | ||
"name": "Immune Response" | ||
}, | ||
{ | ||
"name": "Proteostasis" | ||
}, | ||
{ | ||
"name": "Tau Homeostasis" | ||
} | ||
] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,57 @@ | ||
import os | ||
import pandas as pd | ||
import pytest | ||
|
||
from agoradatatools.etl.transform import biodomain_info | ||
|
||
|
||
class TestTransformBiodomainInfo:
    """Fixture-driven tests for ``biodomain_info.transform_biodomain_info``.

    Each passing case pairs an input CSV (under ``input/``) with the JSON
    file (under ``output/``) holding the expected transform result.
    """

    data_files_path = "tests/test_assets/biodomain_info"

    # (input CSV, expected output JSON) pairs; ids below are in the same order.
    pass_test_data = [
        # Pass with good data
        ("biodomain_info_good_input.csv", "biodomain_info_good_output.json"),
        # Pass with values missing from each column
        ("biodomain_info_imperfect_input.csv", "biodomain_info_imperfect_output.json"),
    ]
    pass_test_ids = [
        "Pass with good data",
        "Pass with missing values in each column",
    ]

    # No failure cases for this transform
    fail_test_data = []
    fail_test_ids = []

    @pytest.mark.parametrize(
        "biodomain_info_file, expected_output_file", pass_test_data, ids=pass_test_ids
    )
    def test_transform_biodomain_info_should_pass(
        self, biodomain_info_file, expected_output_file
    ):
        # Resolve both fixture locations relative to the shared asset folder.
        input_path = os.path.join(self.data_files_path, "input", biodomain_info_file)
        expected_path = os.path.join(
            self.data_files_path, "output", expected_output_file
        )

        source_df = pd.read_csv(input_path)
        actual_df = biodomain_info.transform_biodomain_info(
            datasets={"genes_biodomains": source_df}
        )
        wanted_df = pd.read_json(expected_path)

        pd.testing.assert_frame_equal(actual_df, wanted_df)

    """
    # Leaving code stub for failure case, in case we want to add this in the future
    @pytest.mark.parametrize("biodomain_info_file", fail_test_data, ids=fail_test_ids)
    def test_transform_biodomain_info_should_fail(self, biodomain_info_file):
        with pytest.raises(<Error type>):
            biodomain_info_df = pd.read_csv(os.path.join(self.data_files_path, "input", biodomain_info_file))
            biodomain_info.transform_biodomain_info(
                datasets={"genes_biodomains": biodomain_info_df}
            )
    """