From 57f4916e92ec95a81b21ca4deb39f128f2ef6381 Mon Sep 17 00:00:00 2001 From: Milton Pividori Date: Mon, 11 Sep 2023 22:02:50 -0600 Subject: [PATCH] ccc pvalue: add notebook to compute CCC/Pearson/Spearman pvalues --- .../15-compute_pvalues_from_samples.ipynb | 1258 +++++++++++++++++ .../py/15-compute_pvalues_from_samples.py | 176 +++ 2 files changed, 1434 insertions(+) create mode 100644 nbs/25_pvalue/15-compute_pvalues_from_samples.ipynb create mode 100644 nbs/25_pvalue/py/15-compute_pvalues_from_samples.py diff --git a/nbs/25_pvalue/15-compute_pvalues_from_samples.ipynb b/nbs/25_pvalue/15-compute_pvalues_from_samples.ipynb new file mode 100644 index 00000000..6582ba60 --- /dev/null +++ b/nbs/25_pvalue/15-compute_pvalues_from_samples.ipynb @@ -0,0 +1,1258 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "ec52faa3-656f-483e-9617-d7ec0f7d818c", + "metadata": { + "papermill": { + "duration": 0.005592, + "end_time": "2023-09-11T19:45:04.267196", + "exception": false, + "start_time": "2023-09-11T19:45:04.261604", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "# Description" + ] + }, + { + "cell_type": "markdown", + "id": "51102f42-fcd9-4a58-9c8d-dfcd3d2d464e", + "metadata": { + "papermill": { + "duration": 0.005669, + "end_time": "2023-09-11T19:45:04.283039", + "exception": false, + "start_time": "2023-09-11T19:45:04.277370", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "TODO" + ] + }, + { + "cell_type": "markdown", + "id": "7006ceeb-2651-407d-bfa1-1039727649ef", + "metadata": { + "papermill": { + "duration": 0.004573, + "end_time": "2023-09-11T19:45:04.292325", + "exception": false, + "start_time": "2023-09-11T19:45:04.287752", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "# Modules loading" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "1ffa1a96-7545-40b9-ac8b-8627e13de8d4", + "metadata": { + "papermill": { + "duration": 0.520306, + "end_time": "2023-09-11T19:45:04.817368", + "exception": false, + "start_time": "2023-09-11T19:45:04.297062", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "import numpy as np\n", + "import matplotlib.pyplot as plt\n", + "import seaborn as sns\n", + "from scipy import stats\n", + "import numpy as np\n", + "import pandas as pd\n", + "from concurrent.futures import as_completed, ProcessPoolExecutor\n", + "\n", + "from ccc.coef import ccc\n", + "from ccc import conf" + ] + }, + { + "cell_type": "markdown", + "id": "0d3cc810-4b17-4213-8f03-6fe7e97a0fe3", + "metadata": { + "papermill": { + "duration": 0.004597, + "end_time": "2023-09-11T19:45:04.827424", + "exception": false, + "start_time": "2023-09-11T19:45:04.822827", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "# Settings" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "9a154623-c787-4a31-871a-cad173f0eb9f", + "metadata": { + "papermill": { + "duration": 0.009038, + "end_time": "2023-09-11T19:45:04.841189", + "exception": false, + "start_time": "2023-09-11T19:45:04.832151", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "DATASET_CONFIG = conf.GTEX\n", + "GTEX_TISSUE = \"whole_blood\"\n", + "GENE_SEL_STRATEGY = \"var_pc_log2\"\n", + "\n", + "PVALUE_N_PERMS = 100000\n", + "\n", + "RANDOM_STATE = np.random.RandomState(0)" + ] + }, + { + "cell_type": "markdown", + "id": "5b09ff83-5377-49a9-b24b-65c6c90277d6", + "metadata": { + "papermill": { + "duration": 0.004576, + "end_time": "2023-09-11T19:45:04.850505", + "exception": false, + "start_time": "2023-09-11T19:45:04.845929", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "# Paths" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "c6f73068-fa38-44be-bd0c-708f6ff450ea", + "metadata": { + "papermill": { + "duration": 0.010711, + "end_time": "2023-09-11T19:45:04.866030", + "exception": false, + "start_time": "2023-09-11T19:45:04.855319", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "PosixPath('/opt/data/results/gtex_v8/gene_selection/gtex_v8_data_whole_blood-var_pc_log2.pkl')" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "INPUT_GENE_EXPR_FILE = (\n", + " DATASET_CONFIG[\"GENE_SELECTION_DIR\"]\n", + " / f\"gtex_v8_data_{GTEX_TISSUE}-{GENE_SEL_STRATEGY}.pkl\"\n", + ")\n", + "display(INPUT_GENE_EXPR_FILE)\n", + "\n", + "assert INPUT_GENE_EXPR_FILE.exists()" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "30cce6f5-ca1b-438c-859d-31903a42d4c6", + "metadata": { + "papermill": { + "duration": 0.011695, + "end_time": "2023-09-11T19:45:04.882830", + "exception": false, + "start_time": "2023-09-11T19:45:04.871135", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "PosixPath('/opt/data/results/gtex_v8/gene_pair_intersections/gene_pair_intersections-gtex_v8-whole_blood-var_pc_log2.pkl')" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "INPUT_GENE_PAIRS_INTERSECTIONS_FILE = (\n", + " DATASET_CONFIG[\"GENE_PAIR_INTERSECTIONS\"]\n", + " / f\"gene_pair_intersections-gtex_v8-{GTEX_TISSUE}-{GENE_SEL_STRATEGY}.pkl\"\n", + ")\n", + "display(INPUT_GENE_PAIRS_INTERSECTIONS_FILE)\n", + "\n", + "assert INPUT_GENE_PAIRS_INTERSECTIONS_FILE.exists()" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "0122253c-99c0-41e2-8807-60df86bf0619", + "metadata": { + "papermill": { + "duration": 0.009909, + "end_time": "2023-09-11T19:45:04.898238", + "exception": false, + "start_time": "2023-09-11T19:45:04.888329", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "OUTPUT_DIR = DATASET_CONFIG[\"GENE_PAIR_INTERSECTIONS\"] / \"pvalues\"\n", + "OUTPUT_DIR.mkdir(parents=True, exist_ok=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "3003ed2c-5da0-43b9-969d-9cf037d05730", + "metadata": { + "papermill": { + "duration": 0.009376, + "end_time": "2023-09-11T19:45:04.912198", + "exception": false, + "start_time": "2023-09-11T19:45:04.902822", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "PosixPath('/opt/data/results/gtex_v8/gene_pair_intersections/pvalues')" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "OUTPUT_DIR" + ] + }, + { + "cell_type": "markdown", + "id": "3d014f0c-d442-48ab-add8-ac338ad15b27", + "metadata": { + "papermill": { + "duration": 0.004041, + "end_time": "2023-09-11T19:45:04.920486", + "exception": false, + "start_time": "2023-09-11T19:45:04.916445", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "# Load gene expression data" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "6e8ef201-6f98-4fb6-a306-180ed4b467db", + "metadata": {}, + "outputs": [], + "source": [ + "data = pd.read_pickle(INPUT_GENE_EXPR_FILE).sort_index()" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "4d18e93e-b394-46bd-8d16-d9261a85ba06", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(5000, 755)" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "ea8947b9-9064-43ec-bf10-6e6ae361c451", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
GTEX-111YS-0006-SM-5NQBEGTEX-1122O-0005-SM-5O99JGTEX-1128S-0005-SM-5P9HIGTEX-113IC-0006-SM-5NQ9CGTEX-113JC-0006-SM-5O997GTEX-117XS-0005-SM-5PNU6GTEX-117YW-0005-SM-5NQ8ZGTEX-1192W-0005-SM-5NQBQGTEX-1192X-0005-SM-5NQC3GTEX-11DXW-0006-SM-5NQ7Y...GTEX-ZVE2-0006-SM-51MRWGTEX-ZVP2-0005-SM-51MRKGTEX-ZVT2-0005-SM-57WBWGTEX-ZVT3-0006-SM-51MT9GTEX-ZVT4-0006-SM-57WB8GTEX-ZVTK-0006-SM-57WBKGTEX-ZVZP-0006-SM-51MSWGTEX-ZVZQ-0006-SM-51MR8GTEX-ZXES-0005-SM-57WCBGTEX-ZXG5-0005-SM-57WCN
gene_ens_id
ENSG00000000419.1220.650025.0507.15549.1306.1474.14305.3904.3891.15806.8240...4.407032.34018.68009.2517.8287.46033.24005.848025.76017.080
ENSG00000000938.12906.00001344.000633.500719.200392.600166.5000338.200413.20051.5400423.6000...354.80001102.000774.9000206.000620.400346.3001304.0000232.9000631.600884.500
ENSG00000001167.148.190020.01020.47021.22016.4608.619018.22016.5801.602035.6800...11.340011.25011.18009.52341.86024.5808.892013.390013.47042.640
ENSG00000001561.60.71041.7712.2346.0143.2060.39622.4451.4180.55310.7447...0.92692.5550.59763.4172.6451.8830.53910.98161.0366.729
ENSG00000002549.1222.500021.33019.290157.10029.3309.577014.17023.3301.407028.3000...4.493050.47016.210032.74018.15011.92020.100015.550011.98035.370
\n", + "

5 rows × 755 columns

\n", + "
" + ], + "text/plain": [ + " GTEX-111YS-0006-SM-5NQBE GTEX-1122O-0005-SM-5O99J \\\n", + "gene_ens_id \n", + "ENSG00000000419.12 20.6500 25.050 \n", + "ENSG00000000938.12 906.0000 1344.000 \n", + "ENSG00000001167.14 8.1900 20.010 \n", + "ENSG00000001561.6 0.7104 1.771 \n", + "ENSG00000002549.12 22.5000 21.330 \n", + "\n", + " GTEX-1128S-0005-SM-5P9HI GTEX-113IC-0006-SM-5NQ9C \\\n", + "gene_ens_id \n", + "ENSG00000000419.12 7.155 49.130 \n", + "ENSG00000000938.12 633.500 719.200 \n", + "ENSG00000001167.14 20.470 21.220 \n", + "ENSG00000001561.6 2.234 6.014 \n", + "ENSG00000002549.12 19.290 157.100 \n", + "\n", + " GTEX-113JC-0006-SM-5O997 GTEX-117XS-0005-SM-5PNU6 \\\n", + "gene_ens_id \n", + "ENSG00000000419.12 6.147 4.1430 \n", + "ENSG00000000938.12 392.600 166.5000 \n", + "ENSG00000001167.14 16.460 8.6190 \n", + "ENSG00000001561.6 3.206 0.3962 \n", + "ENSG00000002549.12 29.330 9.5770 \n", + "\n", + " GTEX-117YW-0005-SM-5NQ8Z GTEX-1192W-0005-SM-5NQBQ \\\n", + "gene_ens_id \n", + "ENSG00000000419.12 5.390 4.389 \n", + "ENSG00000000938.12 338.200 413.200 \n", + "ENSG00000001167.14 18.220 16.580 \n", + "ENSG00000001561.6 2.445 1.418 \n", + "ENSG00000002549.12 14.170 23.330 \n", + "\n", + " GTEX-1192X-0005-SM-5NQC3 GTEX-11DXW-0006-SM-5NQ7Y ... \\\n", + "gene_ens_id ... \n", + "ENSG00000000419.12 1.1580 6.8240 ... \n", + "ENSG00000000938.12 51.5400 423.6000 ... \n", + "ENSG00000001167.14 1.6020 35.6800 ... \n", + "ENSG00000001561.6 0.5531 0.7447 ... \n", + "ENSG00000002549.12 1.4070 28.3000 ... \n", + "\n", + " GTEX-ZVE2-0006-SM-51MRW GTEX-ZVP2-0005-SM-51MRK \\\n", + "gene_ens_id \n", + "ENSG00000000419.12 4.4070 32.340 \n", + "ENSG00000000938.12 354.8000 1102.000 \n", + "ENSG00000001167.14 11.3400 11.250 \n", + "ENSG00000001561.6 0.9269 2.555 \n", + "ENSG00000002549.12 4.4930 50.470 \n", + "\n", + " GTEX-ZVT2-0005-SM-57WBW GTEX-ZVT3-0006-SM-51MT9 \\\n", + "gene_ens_id \n", + "ENSG00000000419.12 18.6800 9.251 \n", + "ENSG00000000938.12 774.9000 206.000 \n", + "ENSG00000001167.14 11.1800 9.523 \n", + "ENSG00000001561.6 0.5976 3.417 \n", + "ENSG00000002549.12 16.2100 32.740 \n", + "\n", + " GTEX-ZVT4-0006-SM-57WB8 GTEX-ZVTK-0006-SM-57WBK \\\n", + "gene_ens_id \n", + "ENSG00000000419.12 7.828 7.460 \n", + "ENSG00000000938.12 620.400 346.300 \n", + "ENSG00000001167.14 41.860 24.580 \n", + "ENSG00000001561.6 2.645 1.883 \n", + "ENSG00000002549.12 18.150 11.920 \n", + "\n", + " GTEX-ZVZP-0006-SM-51MSW GTEX-ZVZQ-0006-SM-51MR8 \\\n", + "gene_ens_id \n", + "ENSG00000000419.12 33.2400 5.8480 \n", + "ENSG00000000938.12 1304.0000 232.9000 \n", + "ENSG00000001167.14 8.8920 13.3900 \n", + "ENSG00000001561.6 0.5391 0.9816 \n", + "ENSG00000002549.12 20.1000 15.5500 \n", + "\n", + " GTEX-ZXES-0005-SM-57WCB GTEX-ZXG5-0005-SM-57WCN \n", + "gene_ens_id \n", + "ENSG00000000419.12 25.760 17.080 \n", + "ENSG00000000938.12 631.600 884.500 \n", + "ENSG00000001167.14 13.470 42.640 \n", + "ENSG00000001561.6 1.036 6.729 \n", + "ENSG00000002549.12 11.980 35.370 \n", + "\n", + "[5 rows x 755 columns]" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data.head()" + ] + }, + { + "cell_type": "markdown", + "id": "5414e2d4-b4c5-48d9-9dd4-d9ff1585f341", + "metadata": { + "papermill": { + "duration": 0.004041, + "end_time": "2023-09-11T19:45:04.920486", + "exception": false, + "start_time": "2023-09-11T19:45:04.916445", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "# Load gene pairs samples" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "178a09a8-1a2e-425a-8a52-773f41c72633", + "metadata": {}, + "outputs": [], + "source": [ + "output_file = OUTPUT_DIR / \"gene_pair-samples.pkl\"" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "c42a9f4c-3672-4ab0-b9ff-c214eb40cd2f", + "metadata": {}, + "outputs": [], + "source": [ + "gene_pair_samples = pd.read_pickle(output_file)" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "1724d63c-19eb-49a8-83fc-6c8b07585e98", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "9" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(gene_pair_samples)" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "99f5098f-aa01-471b-a6a2-5aabc332176b", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['all_high',\n", + " 'all_low',\n", + " 'ccc_high_and_pearson_low',\n", + " 'ccc_high_and_spearman_low',\n", + " 'ccc_high_and_spearman_pearson_low',\n", + " 'ccc_spearman_high_and_pearson_low',\n", + " 'pearson_high_and_ccc_low',\n", + " 'pearson_high_and_ccc_spearman_low',\n", + " 'selected_in_manuscript']" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sorted(gene_pair_samples.keys())" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "c60378f6-3f87-49d4-8b86-cf3ec30fc545", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Pearson (high)Pearson (low)Spearman (high)Spearman (low)Clustermatch (high)Clustermatch (low)cccpearsonspearman
ENSG00000052749.13ENSG00000165025.14TrueFalseTrueFalseTrueFalse0.3623400.7094490.795566
ENSG00000102897.9ENSG00000086544.2TrueFalseTrueFalseTrueFalse0.4290920.6985370.822212
ENSG00000110628.13ENSG00000267078.1TrueFalseTrueFalseTrueFalse0.2301430.5094990.632816
ENSG00000169554.18ENSG00000132424.14TrueFalseTrueFalseTrueFalse0.5090120.7737620.878352
ENSG00000143933.16ENSG00000135378.3TrueFalseTrueFalseTrueFalse0.4718420.5311210.819382
\n", + "
" + ], + "text/plain": [ + " Pearson (high) Pearson (low) \\\n", + "ENSG00000052749.13 ENSG00000165025.14 True False \n", + "ENSG00000102897.9 ENSG00000086544.2 True False \n", + "ENSG00000110628.13 ENSG00000267078.1 True False \n", + "ENSG00000169554.18 ENSG00000132424.14 True False \n", + "ENSG00000143933.16 ENSG00000135378.3 True False \n", + "\n", + " Spearman (high) Spearman (low) \\\n", + "ENSG00000052749.13 ENSG00000165025.14 True False \n", + "ENSG00000102897.9 ENSG00000086544.2 True False \n", + "ENSG00000110628.13 ENSG00000267078.1 True False \n", + "ENSG00000169554.18 ENSG00000132424.14 True False \n", + "ENSG00000143933.16 ENSG00000135378.3 True False \n", + "\n", + " Clustermatch (high) \\\n", + "ENSG00000052749.13 ENSG00000165025.14 True \n", + "ENSG00000102897.9 ENSG00000086544.2 True \n", + "ENSG00000110628.13 ENSG00000267078.1 True \n", + "ENSG00000169554.18 ENSG00000132424.14 True \n", + "ENSG00000143933.16 ENSG00000135378.3 True \n", + "\n", + " Clustermatch (low) ccc pearson \\\n", + "ENSG00000052749.13 ENSG00000165025.14 False 0.362340 0.709449 \n", + "ENSG00000102897.9 ENSG00000086544.2 False 0.429092 0.698537 \n", + "ENSG00000110628.13 ENSG00000267078.1 False 0.230143 0.509499 \n", + "ENSG00000169554.18 ENSG00000132424.14 False 0.509012 0.773762 \n", + "ENSG00000143933.16 ENSG00000135378.3 False 0.471842 0.531121 \n", + "\n", + " spearman \n", + "ENSG00000052749.13 ENSG00000165025.14 0.795566 \n", + "ENSG00000102897.9 ENSG00000086544.2 0.822212 \n", + "ENSG00000110628.13 ENSG00000267078.1 0.632816 \n", + "ENSG00000169554.18 ENSG00000132424.14 0.878352 \n", + "ENSG00000143933.16 ENSG00000135378.3 0.819382 " + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "gene_pair_samples[\"all_high\"].head()" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "6ccae66e-e276-43c3-809c-512aa0fe795b", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[('ENSG00000052749.13', 'ENSG00000165025.14'),\n", + " ('ENSG00000102897.9', 'ENSG00000086544.2'),\n", + " ('ENSG00000110628.13', 'ENSG00000267078.1'),\n", + " ('ENSG00000169554.18', 'ENSG00000132424.14'),\n", + " ('ENSG00000143933.16', 'ENSG00000135378.3'),\n", + " ('ENSG00000170776.21', 'ENSG00000155903.11'),\n", + " ('ENSG00000136111.12', 'ENSG00000065911.11'),\n", + " ('ENSG00000131042.14', 'ENSG00000141367.11'),\n", + " ('ENSG00000160703.15', 'ENSG00000231964.1'),\n", + " ('ENSG00000008394.12', 'ENSG00000101347.8')]" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "[i for i in gene_pair_samples[\"all_high\"].head(10).index]" + ] + }, + { + "cell_type": "markdown", + "id": "6402879c-e9a0-414b-b60b-9e4ed1e9e99e", + "metadata": {}, + "source": [ + "# Compute pvalues on sampled gene pairs" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "62d8632e-13e0-4a78-ad30-26770172d21e", + "metadata": { + "papermill": { + "duration": 0.011689, + "end_time": "2023-09-11T19:45:05.843928", + "exception": false, + "start_time": "2023-09-11T19:45:05.832239", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "output_file = OUTPUT_DIR / \"gene_pair-samples-pvalues.pkl\"" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "c8a85ce0-4c5a-4ed9-8ad6-24b21fb10b1e", + "metadata": { + "papermill": { + "duration": 0.003646, + "end_time": "2023-09-11T17:55:28.650841", + "exception": false, + "start_time": "2023-09-11T17:55:28.647195", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "def corr_single(x, y):\n", + " ccc_val, ccc_pval = ccc(x, y, pvalue_n_perms=PVALUE_N_PERMS, n_jobs=1)\n", + " p_val, p_pval = stats.pearsonr(x, y)\n", + " s_val, s_pval = stats.spearmanr(x, y)\n", + " \n", + " return ccc_val, ccc_pval, p_val, p_pval, s_val, s_pval" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "d9838801-1f01-4316-8e29-ffedbdc2a67a", + "metadata": {}, + "outputs": [], + "source": [ + "results = []\n", + "\n", + "with ProcessPoolExecutor(max_workers=conf.GENERAL[\"N_JOBS\"]) as executor:\n", + " tasks = {\n", + " executor.submit(corr_single, data.loc[gene0], data.loc[gene1]): (gene0, gene1, k)\n", + " for k, v in gene_pair_samples.items()\n", + " for gene0, gene1 in gene_pair_samples[k].index\n", + " }\n", + "\n", + " for t_idx, t in enumerate(as_completed(tasks)):\n", + " gene0, gene1, k = tasks[t]\n", + " ccc_val, ccc_pval, p_val, p_pval, s_val, s_pval = t.result()\n", + "\n", + " results.append({\n", + " \"gene0\": gene0,\n", + " \"gene1\": gene1,\n", + " \"group\": k,\n", + " \"ccc\": ccc_val,\n", + " \"ccc_pvalue\": ccc_pval,\n", + " \"pearson\": p_val,\n", + " \"pearson_pvalue\": p_pval,\n", + " \"spearman\": s_val,\n", + " \"spearman_pvalue\": s_pval,\n", + " })\n", + " \n", + " if t_idx % 10:\n", + " _df = pd.DataFrame(results)\n", + " _df[\"group\"] = _df[\"group\"].astype(\"category\")\n", + " _df.to_pickle(output_file)" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "6f32ad1a-3b2f-4e08-8a53-35cfb68e3970", + "metadata": { + "papermill": { + "duration": 432.020716, + "end_time": "2023-09-11T18:02:40.673133", + "exception": false, + "start_time": "2023-09-11T17:55:28.652417", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "644" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(results)" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "e68a65a5-8bba-4a79-a740-26d722dc670e", + "metadata": { + "papermill": { + "duration": 0.012915, + "end_time": "2023-09-11T18:02:40.688462", + "exception": false, + "start_time": "2023-09-11T18:02:40.675547", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "results_df = pd.DataFrame(results)\n", + "results_df[\"group\"] = results_df[\"group\"].astype(\"category\")" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "9514ebb1-f1c1-46d9-96b6-a2264e3a6b4b", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(644, 9)" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "results_df.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "6110dd19-95e0-4400-847a-424a498fa63d", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
gene0gene1groupcccccc_pvaluepearsonpearson_pvaluespearmanspearman_pvalue
0ENSG00000170776.21ENSG00000155903.11all_high0.3249870.0099010.7513374.609357e-1380.7697466.110239e-149
1ENSG00000169554.18ENSG00000132424.14all_high0.5090120.0099010.7737621.893487e-1510.8783521.374455e-243
2ENSG00000102978.12ENSG00000144224.16all_high0.1819400.0099010.4813084.903005e-450.5860148.034343e-71
3ENSG00000135828.11ENSG00000250138.4all_high0.3883660.0099010.7518432.382590e-1380.7947991.719432e-165
4ENSG00000174705.12ENSG00000164105.3all_high0.2617620.0099010.5869374.312390e-710.6747042.085742e-101
\n", + "
" + ], + "text/plain": [ + " gene0 gene1 group ccc ccc_pvalue \\\n", + "0 ENSG00000170776.21 ENSG00000155903.11 all_high 0.324987 0.009901 \n", + "1 ENSG00000169554.18 ENSG00000132424.14 all_high 0.509012 0.009901 \n", + "2 ENSG00000102978.12 ENSG00000144224.16 all_high 0.181940 0.009901 \n", + "3 ENSG00000135828.11 ENSG00000250138.4 all_high 0.388366 0.009901 \n", + "4 ENSG00000174705.12 ENSG00000164105.3 all_high 0.261762 0.009901 \n", + "\n", + " pearson pearson_pvalue spearman spearman_pvalue \n", + "0 0.751337 4.609357e-138 0.769746 6.110239e-149 \n", + "1 0.773762 1.893487e-151 0.878352 1.374455e-243 \n", + "2 0.481308 4.903005e-45 0.586014 8.034343e-71 \n", + "3 0.751843 2.382590e-138 0.794799 1.719432e-165 \n", + "4 0.586937 4.312390e-71 0.674704 2.085742e-101 " + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "results_df.head()" + ] + }, + { + "cell_type": "markdown", + "id": "c15b6534-bd10-4091-a33d-e924a883cd93", + "metadata": { + "papermill": { + "duration": 0.007585, + "end_time": "2023-09-11T19:45:05.824409", + "exception": false, + "start_time": "2023-09-11T19:45:05.816824", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "# Save" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "id": "bb8e28d4-3adf-4d6a-a94e-81b6763ebd61", + "metadata": { + "papermill": { + "duration": 0.012605, + "end_time": "2023-09-11T19:45:05.864400", + "exception": false, + "start_time": "2023-09-11T19:45:05.851795", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "results_df.to_pickle(output_file)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "46cff789-6c90-42c5-9328-21ee7bd70851", + "metadata": { + "papermill": { + "duration": 0.007756, + "end_time": "2023-09-11T19:45:05.880049", + "exception": false, + "start_time": "2023-09-11T19:45:05.872293", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [] + } + ], + "metadata": { + "jupytext": { + "cell_metadata_filter": "all,-execution,-papermill,-trusted", + "notebook_metadata_filter": "-jupytext.text_representation.jupytext_version", + "text_representation": { + "extension": ".py", + "format_name": "percent", + "format_version": "1.3" + } + }, + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.12" + }, + "papermill": { + "default_parameters": {}, + "duration": 2.811804, + "end_time": "2023-09-11T19:45:06.202793", + "environment_variables": {}, + "exception": null, + "input_path": "nbs/25_pvalue/10-sample_gene_pair_categories.ipynb", + "output_path": "nbs/25_pvalue/10-sample_gene_pair_categories.run.ipynb", + "parameters": {}, + "start_time": "2023-09-11T19:45:03.390989", + "version": "2.3.4" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/nbs/25_pvalue/py/15-compute_pvalues_from_samples.py b/nbs/25_pvalue/py/15-compute_pvalues_from_samples.py new file mode 100644 index 00000000..e506215d --- /dev/null +++ b/nbs/25_pvalue/py/15-compute_pvalues_from_samples.py @@ -0,0 +1,176 @@ +# --- +# jupyter: +# jupytext: +# cell_metadata_filter: all,-execution,-papermill,-trusted +# notebook_metadata_filter: -jupytext.text_representation.jupytext_version +# text_representation: +# extension: .py +# format_name: percent +# format_version: '1.3' +# kernelspec: +# display_name: Python 3 (ipykernel) +# language: python +# name: python3 +# --- + +# %% [markdown] tags=[] +# # Description + +# %% [markdown] tags=[] +# TODO + +# %% [markdown] tags=[] +# # Modules loading + +# %% tags=[] +import numpy as np +import matplotlib.pyplot as plt +import seaborn as sns +from scipy import stats +import numpy as np +import pandas as pd +from concurrent.futures import as_completed, ProcessPoolExecutor + +from ccc.coef import ccc +from ccc import conf + +# %% [markdown] tags=[] +# # Settings + +# %% tags=[] +DATASET_CONFIG = conf.GTEX +GTEX_TISSUE = "whole_blood" +GENE_SEL_STRATEGY = "var_pc_log2" + +PVALUE_N_PERMS = 100000 + +RANDOM_STATE = np.random.RandomState(0) + +# %% [markdown] tags=[] +# # Paths + +# %% tags=[] +INPUT_GENE_EXPR_FILE = ( + DATASET_CONFIG["GENE_SELECTION_DIR"] + / f"gtex_v8_data_{GTEX_TISSUE}-{GENE_SEL_STRATEGY}.pkl" +) +display(INPUT_GENE_EXPR_FILE) + +assert INPUT_GENE_EXPR_FILE.exists() + +# %% tags=[] +INPUT_GENE_PAIRS_INTERSECTIONS_FILE = ( + DATASET_CONFIG["GENE_PAIR_INTERSECTIONS"] + / f"gene_pair_intersections-gtex_v8-{GTEX_TISSUE}-{GENE_SEL_STRATEGY}.pkl" +) +display(INPUT_GENE_PAIRS_INTERSECTIONS_FILE) + +assert INPUT_GENE_PAIRS_INTERSECTIONS_FILE.exists() + +# %% tags=[] +OUTPUT_DIR = DATASET_CONFIG["GENE_PAIR_INTERSECTIONS"] / "pvalues" +OUTPUT_DIR.mkdir(parents=True, exist_ok=True) + +# %% tags=[] +OUTPUT_DIR + +# %% [markdown] tags=[] +# # Load gene expression data + +# %% +data = pd.read_pickle(INPUT_GENE_EXPR_FILE).sort_index() + +# %% +data.shape + +# %% +data.head() + +# %% [markdown] tags=[] +# # Load gene pairs samples + +# %% +output_file = OUTPUT_DIR / "gene_pair-samples.pkl" + +# %% +gene_pair_samples = pd.read_pickle(output_file) + +# %% +len(gene_pair_samples) + +# %% +sorted(gene_pair_samples.keys()) + +# %% +gene_pair_samples["all_high"].head() + +# %% +[i for i in gene_pair_samples["all_high"].head(10).index] + +# %% [markdown] +# # Compute pvalues on sampled gene pairs + +# %% tags=[] +output_file = OUTPUT_DIR / "gene_pair-samples-pvalues.pkl" + + +# %% tags=[] +def corr_single(x, y): + ccc_val, ccc_pval = ccc(x, y, pvalue_n_perms=PVALUE_N_PERMS, n_jobs=1) + p_val, p_pval = stats.pearsonr(x, y) + s_val, s_pval = stats.spearmanr(x, y) + + return ccc_val, ccc_pval, p_val, p_pval, s_val, s_pval + + +# %% +results = [] + +with ProcessPoolExecutor(max_workers=conf.GENERAL["N_JOBS"]) as executor: + tasks = { + executor.submit(corr_single, data.loc[gene0], data.loc[gene1]): (gene0, gene1, k) + for k, v in gene_pair_samples.items() + for gene0, gene1 in gene_pair_samples[k].index + } + + for t_idx, t in enumerate(as_completed(tasks)): + gene0, gene1, k = tasks[t] + ccc_val, ccc_pval, p_val, p_pval, s_val, s_pval = t.result() + + results.append({ + "gene0": gene0, + "gene1": gene1, + "group": k, + "ccc": ccc_val, + "ccc_pvalue": ccc_pval, + "pearson": p_val, + "pearson_pvalue": p_pval, + "spearman": s_val, + "spearman_pvalue": s_pval, + }) + + if t_idx % 10: + _df = pd.DataFrame(results) + _df["group"] = _df["group"].astype("category") + _df.to_pickle(output_file) + +# %% tags=[] +len(results) + +# %% tags=[] +results_df = pd.DataFrame(results) +results_df["group"] = results_df["group"].astype("category") + +# %% +results_df.shape + +# %% +results_df.head() + +# %% [markdown] tags=[] +# # Save + +# %% tags=[] +results_df.to_pickle(output_file) + +# %% tags=[]