diff --git a/nbs/25_pvalue/15-compute_pvalues_from_samples.ipynb b/nbs/25_pvalue/15-compute_pvalues_from_samples.ipynb
new file mode 100644
index 00000000..6582ba60
--- /dev/null
+++ b/nbs/25_pvalue/15-compute_pvalues_from_samples.ipynb
@@ -0,0 +1,1258 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "id": "ec52faa3-656f-483e-9617-d7ec0f7d818c",
+ "metadata": {
+ "papermill": {
+ "duration": 0.005592,
+ "end_time": "2023-09-11T19:45:04.267196",
+ "exception": false,
+ "start_time": "2023-09-11T19:45:04.261604",
+ "status": "completed"
+ },
+ "tags": []
+ },
+ "source": [
+ "# Description"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "51102f42-fcd9-4a58-9c8d-dfcd3d2d464e",
+ "metadata": {
+ "papermill": {
+ "duration": 0.005669,
+ "end_time": "2023-09-11T19:45:04.283039",
+ "exception": false,
+ "start_time": "2023-09-11T19:45:04.277370",
+ "status": "completed"
+ },
+ "tags": []
+ },
+ "source": [
+ "TODO"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "7006ceeb-2651-407d-bfa1-1039727649ef",
+ "metadata": {
+ "papermill": {
+ "duration": 0.004573,
+ "end_time": "2023-09-11T19:45:04.292325",
+ "exception": false,
+ "start_time": "2023-09-11T19:45:04.287752",
+ "status": "completed"
+ },
+ "tags": []
+ },
+ "source": [
+ "# Modules loading"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "id": "1ffa1a96-7545-40b9-ac8b-8627e13de8d4",
+ "metadata": {
+ "papermill": {
+ "duration": 0.520306,
+ "end_time": "2023-09-11T19:45:04.817368",
+ "exception": false,
+ "start_time": "2023-09-11T19:45:04.297062",
+ "status": "completed"
+ },
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "import numpy as np\n",
+ "import matplotlib.pyplot as plt\n",
+ "import seaborn as sns\n",
+ "from scipy import stats\n",
+ "import numpy as np\n",
+ "import pandas as pd\n",
+ "from concurrent.futures import as_completed, ProcessPoolExecutor\n",
+ "\n",
+ "from ccc.coef import ccc\n",
+ "from ccc import conf"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "0d3cc810-4b17-4213-8f03-6fe7e97a0fe3",
+ "metadata": {
+ "papermill": {
+ "duration": 0.004597,
+ "end_time": "2023-09-11T19:45:04.827424",
+ "exception": false,
+ "start_time": "2023-09-11T19:45:04.822827",
+ "status": "completed"
+ },
+ "tags": []
+ },
+ "source": [
+ "# Settings"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "id": "9a154623-c787-4a31-871a-cad173f0eb9f",
+ "metadata": {
+ "papermill": {
+ "duration": 0.009038,
+ "end_time": "2023-09-11T19:45:04.841189",
+ "exception": false,
+ "start_time": "2023-09-11T19:45:04.832151",
+ "status": "completed"
+ },
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "DATASET_CONFIG = conf.GTEX\n",
+ "GTEX_TISSUE = \"whole_blood\"\n",
+ "GENE_SEL_STRATEGY = \"var_pc_log2\"\n",
+ "\n",
+ "PVALUE_N_PERMS = 100000\n",
+ "\n",
+ "RANDOM_STATE = np.random.RandomState(0)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "5b09ff83-5377-49a9-b24b-65c6c90277d6",
+ "metadata": {
+ "papermill": {
+ "duration": 0.004576,
+ "end_time": "2023-09-11T19:45:04.850505",
+ "exception": false,
+ "start_time": "2023-09-11T19:45:04.845929",
+ "status": "completed"
+ },
+ "tags": []
+ },
+ "source": [
+ "# Paths"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "id": "c6f73068-fa38-44be-bd0c-708f6ff450ea",
+ "metadata": {
+ "papermill": {
+ "duration": 0.010711,
+ "end_time": "2023-09-11T19:45:04.866030",
+ "exception": false,
+ "start_time": "2023-09-11T19:45:04.855319",
+ "status": "completed"
+ },
+ "tags": []
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "PosixPath('/opt/data/results/gtex_v8/gene_selection/gtex_v8_data_whole_blood-var_pc_log2.pkl')"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "INPUT_GENE_EXPR_FILE = (\n",
+ " DATASET_CONFIG[\"GENE_SELECTION_DIR\"]\n",
+ " / f\"gtex_v8_data_{GTEX_TISSUE}-{GENE_SEL_STRATEGY}.pkl\"\n",
+ ")\n",
+ "display(INPUT_GENE_EXPR_FILE)\n",
+ "\n",
+ "assert INPUT_GENE_EXPR_FILE.exists()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "id": "30cce6f5-ca1b-438c-859d-31903a42d4c6",
+ "metadata": {
+ "papermill": {
+ "duration": 0.011695,
+ "end_time": "2023-09-11T19:45:04.882830",
+ "exception": false,
+ "start_time": "2023-09-11T19:45:04.871135",
+ "status": "completed"
+ },
+ "tags": []
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "PosixPath('/opt/data/results/gtex_v8/gene_pair_intersections/gene_pair_intersections-gtex_v8-whole_blood-var_pc_log2.pkl')"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "INPUT_GENE_PAIRS_INTERSECTIONS_FILE = (\n",
+ " DATASET_CONFIG[\"GENE_PAIR_INTERSECTIONS\"]\n",
+ " / f\"gene_pair_intersections-gtex_v8-{GTEX_TISSUE}-{GENE_SEL_STRATEGY}.pkl\"\n",
+ ")\n",
+ "display(INPUT_GENE_PAIRS_INTERSECTIONS_FILE)\n",
+ "\n",
+ "assert INPUT_GENE_PAIRS_INTERSECTIONS_FILE.exists()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "id": "0122253c-99c0-41e2-8807-60df86bf0619",
+ "metadata": {
+ "papermill": {
+ "duration": 0.009909,
+ "end_time": "2023-09-11T19:45:04.898238",
+ "exception": false,
+ "start_time": "2023-09-11T19:45:04.888329",
+ "status": "completed"
+ },
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "OUTPUT_DIR = DATASET_CONFIG[\"GENE_PAIR_INTERSECTIONS\"] / \"pvalues\"\n",
+ "OUTPUT_DIR.mkdir(parents=True, exist_ok=True)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "id": "3003ed2c-5da0-43b9-969d-9cf037d05730",
+ "metadata": {
+ "papermill": {
+ "duration": 0.009376,
+ "end_time": "2023-09-11T19:45:04.912198",
+ "exception": false,
+ "start_time": "2023-09-11T19:45:04.902822",
+ "status": "completed"
+ },
+ "tags": []
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "PosixPath('/opt/data/results/gtex_v8/gene_pair_intersections/pvalues')"
+ ]
+ },
+ "execution_count": 6,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "OUTPUT_DIR"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "3d014f0c-d442-48ab-add8-ac338ad15b27",
+ "metadata": {
+ "papermill": {
+ "duration": 0.004041,
+ "end_time": "2023-09-11T19:45:04.920486",
+ "exception": false,
+ "start_time": "2023-09-11T19:45:04.916445",
+ "status": "completed"
+ },
+ "tags": []
+ },
+ "source": [
+ "# Load gene expression data"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "id": "6e8ef201-6f98-4fb6-a306-180ed4b467db",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "data = pd.read_pickle(INPUT_GENE_EXPR_FILE).sort_index()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "id": "4d18e93e-b394-46bd-8d16-d9261a85ba06",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(5000, 755)"
+ ]
+ },
+ "execution_count": 8,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "data.shape"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "id": "ea8947b9-9064-43ec-bf10-6e6ae361c451",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " GTEX-111YS-0006-SM-5NQBE | \n",
+ " GTEX-1122O-0005-SM-5O99J | \n",
+ " GTEX-1128S-0005-SM-5P9HI | \n",
+ " GTEX-113IC-0006-SM-5NQ9C | \n",
+ " GTEX-113JC-0006-SM-5O997 | \n",
+ " GTEX-117XS-0005-SM-5PNU6 | \n",
+ " GTEX-117YW-0005-SM-5NQ8Z | \n",
+ " GTEX-1192W-0005-SM-5NQBQ | \n",
+ " GTEX-1192X-0005-SM-5NQC3 | \n",
+ " GTEX-11DXW-0006-SM-5NQ7Y | \n",
+ " ... | \n",
+ " GTEX-ZVE2-0006-SM-51MRW | \n",
+ " GTEX-ZVP2-0005-SM-51MRK | \n",
+ " GTEX-ZVT2-0005-SM-57WBW | \n",
+ " GTEX-ZVT3-0006-SM-51MT9 | \n",
+ " GTEX-ZVT4-0006-SM-57WB8 | \n",
+ " GTEX-ZVTK-0006-SM-57WBK | \n",
+ " GTEX-ZVZP-0006-SM-51MSW | \n",
+ " GTEX-ZVZQ-0006-SM-51MR8 | \n",
+ " GTEX-ZXES-0005-SM-57WCB | \n",
+ " GTEX-ZXG5-0005-SM-57WCN | \n",
+ "
\n",
+ " \n",
+ " gene_ens_id | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " ENSG00000000419.12 | \n",
+ " 20.6500 | \n",
+ " 25.050 | \n",
+ " 7.155 | \n",
+ " 49.130 | \n",
+ " 6.147 | \n",
+ " 4.1430 | \n",
+ " 5.390 | \n",
+ " 4.389 | \n",
+ " 1.1580 | \n",
+ " 6.8240 | \n",
+ " ... | \n",
+ " 4.4070 | \n",
+ " 32.340 | \n",
+ " 18.6800 | \n",
+ " 9.251 | \n",
+ " 7.828 | \n",
+ " 7.460 | \n",
+ " 33.2400 | \n",
+ " 5.8480 | \n",
+ " 25.760 | \n",
+ " 17.080 | \n",
+ "
\n",
+ " \n",
+ " ENSG00000000938.12 | \n",
+ " 906.0000 | \n",
+ " 1344.000 | \n",
+ " 633.500 | \n",
+ " 719.200 | \n",
+ " 392.600 | \n",
+ " 166.5000 | \n",
+ " 338.200 | \n",
+ " 413.200 | \n",
+ " 51.5400 | \n",
+ " 423.6000 | \n",
+ " ... | \n",
+ " 354.8000 | \n",
+ " 1102.000 | \n",
+ " 774.9000 | \n",
+ " 206.000 | \n",
+ " 620.400 | \n",
+ " 346.300 | \n",
+ " 1304.0000 | \n",
+ " 232.9000 | \n",
+ " 631.600 | \n",
+ " 884.500 | \n",
+ "
\n",
+ " \n",
+ " ENSG00000001167.14 | \n",
+ " 8.1900 | \n",
+ " 20.010 | \n",
+ " 20.470 | \n",
+ " 21.220 | \n",
+ " 16.460 | \n",
+ " 8.6190 | \n",
+ " 18.220 | \n",
+ " 16.580 | \n",
+ " 1.6020 | \n",
+ " 35.6800 | \n",
+ " ... | \n",
+ " 11.3400 | \n",
+ " 11.250 | \n",
+ " 11.1800 | \n",
+ " 9.523 | \n",
+ " 41.860 | \n",
+ " 24.580 | \n",
+ " 8.8920 | \n",
+ " 13.3900 | \n",
+ " 13.470 | \n",
+ " 42.640 | \n",
+ "
\n",
+ " \n",
+ " ENSG00000001561.6 | \n",
+ " 0.7104 | \n",
+ " 1.771 | \n",
+ " 2.234 | \n",
+ " 6.014 | \n",
+ " 3.206 | \n",
+ " 0.3962 | \n",
+ " 2.445 | \n",
+ " 1.418 | \n",
+ " 0.5531 | \n",
+ " 0.7447 | \n",
+ " ... | \n",
+ " 0.9269 | \n",
+ " 2.555 | \n",
+ " 0.5976 | \n",
+ " 3.417 | \n",
+ " 2.645 | \n",
+ " 1.883 | \n",
+ " 0.5391 | \n",
+ " 0.9816 | \n",
+ " 1.036 | \n",
+ " 6.729 | \n",
+ "
\n",
+ " \n",
+ " ENSG00000002549.12 | \n",
+ " 22.5000 | \n",
+ " 21.330 | \n",
+ " 19.290 | \n",
+ " 157.100 | \n",
+ " 29.330 | \n",
+ " 9.5770 | \n",
+ " 14.170 | \n",
+ " 23.330 | \n",
+ " 1.4070 | \n",
+ " 28.3000 | \n",
+ " ... | \n",
+ " 4.4930 | \n",
+ " 50.470 | \n",
+ " 16.2100 | \n",
+ " 32.740 | \n",
+ " 18.150 | \n",
+ " 11.920 | \n",
+ " 20.1000 | \n",
+ " 15.5500 | \n",
+ " 11.980 | \n",
+ " 35.370 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
5 rows × 755 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " GTEX-111YS-0006-SM-5NQBE GTEX-1122O-0005-SM-5O99J \\\n",
+ "gene_ens_id \n",
+ "ENSG00000000419.12 20.6500 25.050 \n",
+ "ENSG00000000938.12 906.0000 1344.000 \n",
+ "ENSG00000001167.14 8.1900 20.010 \n",
+ "ENSG00000001561.6 0.7104 1.771 \n",
+ "ENSG00000002549.12 22.5000 21.330 \n",
+ "\n",
+ " GTEX-1128S-0005-SM-5P9HI GTEX-113IC-0006-SM-5NQ9C \\\n",
+ "gene_ens_id \n",
+ "ENSG00000000419.12 7.155 49.130 \n",
+ "ENSG00000000938.12 633.500 719.200 \n",
+ "ENSG00000001167.14 20.470 21.220 \n",
+ "ENSG00000001561.6 2.234 6.014 \n",
+ "ENSG00000002549.12 19.290 157.100 \n",
+ "\n",
+ " GTEX-113JC-0006-SM-5O997 GTEX-117XS-0005-SM-5PNU6 \\\n",
+ "gene_ens_id \n",
+ "ENSG00000000419.12 6.147 4.1430 \n",
+ "ENSG00000000938.12 392.600 166.5000 \n",
+ "ENSG00000001167.14 16.460 8.6190 \n",
+ "ENSG00000001561.6 3.206 0.3962 \n",
+ "ENSG00000002549.12 29.330 9.5770 \n",
+ "\n",
+ " GTEX-117YW-0005-SM-5NQ8Z GTEX-1192W-0005-SM-5NQBQ \\\n",
+ "gene_ens_id \n",
+ "ENSG00000000419.12 5.390 4.389 \n",
+ "ENSG00000000938.12 338.200 413.200 \n",
+ "ENSG00000001167.14 18.220 16.580 \n",
+ "ENSG00000001561.6 2.445 1.418 \n",
+ "ENSG00000002549.12 14.170 23.330 \n",
+ "\n",
+ " GTEX-1192X-0005-SM-5NQC3 GTEX-11DXW-0006-SM-5NQ7Y ... \\\n",
+ "gene_ens_id ... \n",
+ "ENSG00000000419.12 1.1580 6.8240 ... \n",
+ "ENSG00000000938.12 51.5400 423.6000 ... \n",
+ "ENSG00000001167.14 1.6020 35.6800 ... \n",
+ "ENSG00000001561.6 0.5531 0.7447 ... \n",
+ "ENSG00000002549.12 1.4070 28.3000 ... \n",
+ "\n",
+ " GTEX-ZVE2-0006-SM-51MRW GTEX-ZVP2-0005-SM-51MRK \\\n",
+ "gene_ens_id \n",
+ "ENSG00000000419.12 4.4070 32.340 \n",
+ "ENSG00000000938.12 354.8000 1102.000 \n",
+ "ENSG00000001167.14 11.3400 11.250 \n",
+ "ENSG00000001561.6 0.9269 2.555 \n",
+ "ENSG00000002549.12 4.4930 50.470 \n",
+ "\n",
+ " GTEX-ZVT2-0005-SM-57WBW GTEX-ZVT3-0006-SM-51MT9 \\\n",
+ "gene_ens_id \n",
+ "ENSG00000000419.12 18.6800 9.251 \n",
+ "ENSG00000000938.12 774.9000 206.000 \n",
+ "ENSG00000001167.14 11.1800 9.523 \n",
+ "ENSG00000001561.6 0.5976 3.417 \n",
+ "ENSG00000002549.12 16.2100 32.740 \n",
+ "\n",
+ " GTEX-ZVT4-0006-SM-57WB8 GTEX-ZVTK-0006-SM-57WBK \\\n",
+ "gene_ens_id \n",
+ "ENSG00000000419.12 7.828 7.460 \n",
+ "ENSG00000000938.12 620.400 346.300 \n",
+ "ENSG00000001167.14 41.860 24.580 \n",
+ "ENSG00000001561.6 2.645 1.883 \n",
+ "ENSG00000002549.12 18.150 11.920 \n",
+ "\n",
+ " GTEX-ZVZP-0006-SM-51MSW GTEX-ZVZQ-0006-SM-51MR8 \\\n",
+ "gene_ens_id \n",
+ "ENSG00000000419.12 33.2400 5.8480 \n",
+ "ENSG00000000938.12 1304.0000 232.9000 \n",
+ "ENSG00000001167.14 8.8920 13.3900 \n",
+ "ENSG00000001561.6 0.5391 0.9816 \n",
+ "ENSG00000002549.12 20.1000 15.5500 \n",
+ "\n",
+ " GTEX-ZXES-0005-SM-57WCB GTEX-ZXG5-0005-SM-57WCN \n",
+ "gene_ens_id \n",
+ "ENSG00000000419.12 25.760 17.080 \n",
+ "ENSG00000000938.12 631.600 884.500 \n",
+ "ENSG00000001167.14 13.470 42.640 \n",
+ "ENSG00000001561.6 1.036 6.729 \n",
+ "ENSG00000002549.12 11.980 35.370 \n",
+ "\n",
+ "[5 rows x 755 columns]"
+ ]
+ },
+ "execution_count": 9,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "data.head()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "5414e2d4-b4c5-48d9-9dd4-d9ff1585f341",
+ "metadata": {
+ "papermill": {
+ "duration": 0.004041,
+ "end_time": "2023-09-11T19:45:04.920486",
+ "exception": false,
+ "start_time": "2023-09-11T19:45:04.916445",
+ "status": "completed"
+ },
+ "tags": []
+ },
+ "source": [
+ "# Load gene pairs samples"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "id": "178a09a8-1a2e-425a-8a52-773f41c72633",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "output_file = OUTPUT_DIR / \"gene_pair-samples.pkl\""
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "id": "c42a9f4c-3672-4ab0-b9ff-c214eb40cd2f",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "gene_pair_samples = pd.read_pickle(output_file)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "id": "1724d63c-19eb-49a8-83fc-6c8b07585e98",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "9"
+ ]
+ },
+ "execution_count": 12,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "len(gene_pair_samples)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "id": "99f5098f-aa01-471b-a6a2-5aabc332176b",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "['all_high',\n",
+ " 'all_low',\n",
+ " 'ccc_high_and_pearson_low',\n",
+ " 'ccc_high_and_spearman_low',\n",
+ " 'ccc_high_and_spearman_pearson_low',\n",
+ " 'ccc_spearman_high_and_pearson_low',\n",
+ " 'pearson_high_and_ccc_low',\n",
+ " 'pearson_high_and_ccc_spearman_low',\n",
+ " 'selected_in_manuscript']"
+ ]
+ },
+ "execution_count": 13,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "sorted(gene_pair_samples.keys())"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "id": "c60378f6-3f87-49d4-8b86-cf3ec30fc545",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " | \n",
+ " Pearson (high) | \n",
+ " Pearson (low) | \n",
+ " Spearman (high) | \n",
+ " Spearman (low) | \n",
+ " Clustermatch (high) | \n",
+ " Clustermatch (low) | \n",
+ " ccc | \n",
+ " pearson | \n",
+ " spearman | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " ENSG00000052749.13 | \n",
+ " ENSG00000165025.14 | \n",
+ " True | \n",
+ " False | \n",
+ " True | \n",
+ " False | \n",
+ " True | \n",
+ " False | \n",
+ " 0.362340 | \n",
+ " 0.709449 | \n",
+ " 0.795566 | \n",
+ "
\n",
+ " \n",
+ " ENSG00000102897.9 | \n",
+ " ENSG00000086544.2 | \n",
+ " True | \n",
+ " False | \n",
+ " True | \n",
+ " False | \n",
+ " True | \n",
+ " False | \n",
+ " 0.429092 | \n",
+ " 0.698537 | \n",
+ " 0.822212 | \n",
+ "
\n",
+ " \n",
+ " ENSG00000110628.13 | \n",
+ " ENSG00000267078.1 | \n",
+ " True | \n",
+ " False | \n",
+ " True | \n",
+ " False | \n",
+ " True | \n",
+ " False | \n",
+ " 0.230143 | \n",
+ " 0.509499 | \n",
+ " 0.632816 | \n",
+ "
\n",
+ " \n",
+ " ENSG00000169554.18 | \n",
+ " ENSG00000132424.14 | \n",
+ " True | \n",
+ " False | \n",
+ " True | \n",
+ " False | \n",
+ " True | \n",
+ " False | \n",
+ " 0.509012 | \n",
+ " 0.773762 | \n",
+ " 0.878352 | \n",
+ "
\n",
+ " \n",
+ " ENSG00000143933.16 | \n",
+ " ENSG00000135378.3 | \n",
+ " True | \n",
+ " False | \n",
+ " True | \n",
+ " False | \n",
+ " True | \n",
+ " False | \n",
+ " 0.471842 | \n",
+ " 0.531121 | \n",
+ " 0.819382 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Pearson (high) Pearson (low) \\\n",
+ "ENSG00000052749.13 ENSG00000165025.14 True False \n",
+ "ENSG00000102897.9 ENSG00000086544.2 True False \n",
+ "ENSG00000110628.13 ENSG00000267078.1 True False \n",
+ "ENSG00000169554.18 ENSG00000132424.14 True False \n",
+ "ENSG00000143933.16 ENSG00000135378.3 True False \n",
+ "\n",
+ " Spearman (high) Spearman (low) \\\n",
+ "ENSG00000052749.13 ENSG00000165025.14 True False \n",
+ "ENSG00000102897.9 ENSG00000086544.2 True False \n",
+ "ENSG00000110628.13 ENSG00000267078.1 True False \n",
+ "ENSG00000169554.18 ENSG00000132424.14 True False \n",
+ "ENSG00000143933.16 ENSG00000135378.3 True False \n",
+ "\n",
+ " Clustermatch (high) \\\n",
+ "ENSG00000052749.13 ENSG00000165025.14 True \n",
+ "ENSG00000102897.9 ENSG00000086544.2 True \n",
+ "ENSG00000110628.13 ENSG00000267078.1 True \n",
+ "ENSG00000169554.18 ENSG00000132424.14 True \n",
+ "ENSG00000143933.16 ENSG00000135378.3 True \n",
+ "\n",
+ " Clustermatch (low) ccc pearson \\\n",
+ "ENSG00000052749.13 ENSG00000165025.14 False 0.362340 0.709449 \n",
+ "ENSG00000102897.9 ENSG00000086544.2 False 0.429092 0.698537 \n",
+ "ENSG00000110628.13 ENSG00000267078.1 False 0.230143 0.509499 \n",
+ "ENSG00000169554.18 ENSG00000132424.14 False 0.509012 0.773762 \n",
+ "ENSG00000143933.16 ENSG00000135378.3 False 0.471842 0.531121 \n",
+ "\n",
+ " spearman \n",
+ "ENSG00000052749.13 ENSG00000165025.14 0.795566 \n",
+ "ENSG00000102897.9 ENSG00000086544.2 0.822212 \n",
+ "ENSG00000110628.13 ENSG00000267078.1 0.632816 \n",
+ "ENSG00000169554.18 ENSG00000132424.14 0.878352 \n",
+ "ENSG00000143933.16 ENSG00000135378.3 0.819382 "
+ ]
+ },
+ "execution_count": 14,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "gene_pair_samples[\"all_high\"].head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 15,
+ "id": "6ccae66e-e276-43c3-809c-512aa0fe795b",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "[('ENSG00000052749.13', 'ENSG00000165025.14'),\n",
+ " ('ENSG00000102897.9', 'ENSG00000086544.2'),\n",
+ " ('ENSG00000110628.13', 'ENSG00000267078.1'),\n",
+ " ('ENSG00000169554.18', 'ENSG00000132424.14'),\n",
+ " ('ENSG00000143933.16', 'ENSG00000135378.3'),\n",
+ " ('ENSG00000170776.21', 'ENSG00000155903.11'),\n",
+ " ('ENSG00000136111.12', 'ENSG00000065911.11'),\n",
+ " ('ENSG00000131042.14', 'ENSG00000141367.11'),\n",
+ " ('ENSG00000160703.15', 'ENSG00000231964.1'),\n",
+ " ('ENSG00000008394.12', 'ENSG00000101347.8')]"
+ ]
+ },
+ "execution_count": 15,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "[i for i in gene_pair_samples[\"all_high\"].head(10).index]"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "6402879c-e9a0-414b-b60b-9e4ed1e9e99e",
+ "metadata": {},
+ "source": [
+ "# Compute pvalues on sampled gene pairs"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 16,
+ "id": "62d8632e-13e0-4a78-ad30-26770172d21e",
+ "metadata": {
+ "papermill": {
+ "duration": 0.011689,
+ "end_time": "2023-09-11T19:45:05.843928",
+ "exception": false,
+ "start_time": "2023-09-11T19:45:05.832239",
+ "status": "completed"
+ },
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "output_file = OUTPUT_DIR / \"gene_pair-samples-pvalues.pkl\""
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 17,
+ "id": "c8a85ce0-4c5a-4ed9-8ad6-24b21fb10b1e",
+ "metadata": {
+ "papermill": {
+ "duration": 0.003646,
+ "end_time": "2023-09-11T17:55:28.650841",
+ "exception": false,
+ "start_time": "2023-09-11T17:55:28.647195",
+ "status": "completed"
+ },
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "def corr_single(x, y):\n",
+ " ccc_val, ccc_pval = ccc(x, y, pvalue_n_perms=PVALUE_N_PERMS, n_jobs=1)\n",
+ " p_val, p_pval = stats.pearsonr(x, y)\n",
+ " s_val, s_pval = stats.spearmanr(x, y)\n",
+ " \n",
+ " return ccc_val, ccc_pval, p_val, p_pval, s_val, s_pval"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 18,
+ "id": "d9838801-1f01-4316-8e29-ffedbdc2a67a",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "results = []\n",
+ "\n",
+ "with ProcessPoolExecutor(max_workers=conf.GENERAL[\"N_JOBS\"]) as executor:\n",
+ " tasks = {\n",
+ " executor.submit(corr_single, data.loc[gene0], data.loc[gene1]): (gene0, gene1, k)\n",
+ " for k, v in gene_pair_samples.items()\n",
+ " for gene0, gene1 in gene_pair_samples[k].index\n",
+ " }\n",
+ "\n",
+ " for t_idx, t in enumerate(as_completed(tasks)):\n",
+ " gene0, gene1, k = tasks[t]\n",
+ " ccc_val, ccc_pval, p_val, p_pval, s_val, s_pval = t.result()\n",
+ "\n",
+ " results.append({\n",
+ " \"gene0\": gene0,\n",
+ " \"gene1\": gene1,\n",
+ " \"group\": k,\n",
+ " \"ccc\": ccc_val,\n",
+ " \"ccc_pvalue\": ccc_pval,\n",
+ " \"pearson\": p_val,\n",
+ " \"pearson_pvalue\": p_pval,\n",
+ " \"spearman\": s_val,\n",
+ " \"spearman_pvalue\": s_pval,\n",
+ " })\n",
+ " \n",
+ " if t_idx % 10:\n",
+ " _df = pd.DataFrame(results)\n",
+ " _df[\"group\"] = _df[\"group\"].astype(\"category\")\n",
+ " _df.to_pickle(output_file)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 19,
+ "id": "6f32ad1a-3b2f-4e08-8a53-35cfb68e3970",
+ "metadata": {
+ "papermill": {
+ "duration": 432.020716,
+ "end_time": "2023-09-11T18:02:40.673133",
+ "exception": false,
+ "start_time": "2023-09-11T17:55:28.652417",
+ "status": "completed"
+ },
+ "tags": []
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "644"
+ ]
+ },
+ "execution_count": 19,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "len(results)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 20,
+ "id": "e68a65a5-8bba-4a79-a740-26d722dc670e",
+ "metadata": {
+ "papermill": {
+ "duration": 0.012915,
+ "end_time": "2023-09-11T18:02:40.688462",
+ "exception": false,
+ "start_time": "2023-09-11T18:02:40.675547",
+ "status": "completed"
+ },
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "results_df = pd.DataFrame(results)\n",
+ "results_df[\"group\"] = results_df[\"group\"].astype(\"category\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 21,
+ "id": "9514ebb1-f1c1-46d9-96b6-a2264e3a6b4b",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(644, 9)"
+ ]
+ },
+ "execution_count": 21,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "results_df.shape"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 22,
+ "id": "6110dd19-95e0-4400-847a-424a498fa63d",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " gene0 | \n",
+ " gene1 | \n",
+ " group | \n",
+ " ccc | \n",
+ " ccc_pvalue | \n",
+ " pearson | \n",
+ " pearson_pvalue | \n",
+ " spearman | \n",
+ " spearman_pvalue | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " ENSG00000170776.21 | \n",
+ " ENSG00000155903.11 | \n",
+ " all_high | \n",
+ " 0.324987 | \n",
+ " 0.009901 | \n",
+ " 0.751337 | \n",
+ " 4.609357e-138 | \n",
+ " 0.769746 | \n",
+ " 6.110239e-149 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " ENSG00000169554.18 | \n",
+ " ENSG00000132424.14 | \n",
+ " all_high | \n",
+ " 0.509012 | \n",
+ " 0.009901 | \n",
+ " 0.773762 | \n",
+ " 1.893487e-151 | \n",
+ " 0.878352 | \n",
+ " 1.374455e-243 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " ENSG00000102978.12 | \n",
+ " ENSG00000144224.16 | \n",
+ " all_high | \n",
+ " 0.181940 | \n",
+ " 0.009901 | \n",
+ " 0.481308 | \n",
+ " 4.903005e-45 | \n",
+ " 0.586014 | \n",
+ " 8.034343e-71 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " ENSG00000135828.11 | \n",
+ " ENSG00000250138.4 | \n",
+ " all_high | \n",
+ " 0.388366 | \n",
+ " 0.009901 | \n",
+ " 0.751843 | \n",
+ " 2.382590e-138 | \n",
+ " 0.794799 | \n",
+ " 1.719432e-165 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " ENSG00000174705.12 | \n",
+ " ENSG00000164105.3 | \n",
+ " all_high | \n",
+ " 0.261762 | \n",
+ " 0.009901 | \n",
+ " 0.586937 | \n",
+ " 4.312390e-71 | \n",
+ " 0.674704 | \n",
+ " 2.085742e-101 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " gene0 gene1 group ccc ccc_pvalue \\\n",
+ "0 ENSG00000170776.21 ENSG00000155903.11 all_high 0.324987 0.009901 \n",
+ "1 ENSG00000169554.18 ENSG00000132424.14 all_high 0.509012 0.009901 \n",
+ "2 ENSG00000102978.12 ENSG00000144224.16 all_high 0.181940 0.009901 \n",
+ "3 ENSG00000135828.11 ENSG00000250138.4 all_high 0.388366 0.009901 \n",
+ "4 ENSG00000174705.12 ENSG00000164105.3 all_high 0.261762 0.009901 \n",
+ "\n",
+ " pearson pearson_pvalue spearman spearman_pvalue \n",
+ "0 0.751337 4.609357e-138 0.769746 6.110239e-149 \n",
+ "1 0.773762 1.893487e-151 0.878352 1.374455e-243 \n",
+ "2 0.481308 4.903005e-45 0.586014 8.034343e-71 \n",
+ "3 0.751843 2.382590e-138 0.794799 1.719432e-165 \n",
+ "4 0.586937 4.312390e-71 0.674704 2.085742e-101 "
+ ]
+ },
+ "execution_count": 22,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "results_df.head()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "c15b6534-bd10-4091-a33d-e924a883cd93",
+ "metadata": {
+ "papermill": {
+ "duration": 0.007585,
+ "end_time": "2023-09-11T19:45:05.824409",
+ "exception": false,
+ "start_time": "2023-09-11T19:45:05.816824",
+ "status": "completed"
+ },
+ "tags": []
+ },
+ "source": [
+ "# Save"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 23,
+ "id": "bb8e28d4-3adf-4d6a-a94e-81b6763ebd61",
+ "metadata": {
+ "papermill": {
+ "duration": 0.012605,
+ "end_time": "2023-09-11T19:45:05.864400",
+ "exception": false,
+ "start_time": "2023-09-11T19:45:05.851795",
+ "status": "completed"
+ },
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "results_df.to_pickle(output_file)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "46cff789-6c90-42c5-9328-21ee7bd70851",
+ "metadata": {
+ "papermill": {
+ "duration": 0.007756,
+ "end_time": "2023-09-11T19:45:05.880049",
+ "exception": false,
+ "start_time": "2023-09-11T19:45:05.872293",
+ "status": "completed"
+ },
+ "tags": []
+ },
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "jupytext": {
+ "cell_metadata_filter": "all,-execution,-papermill,-trusted",
+ "notebook_metadata_filter": "-jupytext.text_representation.jupytext_version",
+ "text_representation": {
+ "extension": ".py",
+ "format_name": "percent",
+ "format_version": "1.3"
+ }
+ },
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.9.12"
+ },
+ "papermill": {
+ "default_parameters": {},
+ "duration": 2.811804,
+ "end_time": "2023-09-11T19:45:06.202793",
+ "environment_variables": {},
+ "exception": null,
+ "input_path": "nbs/25_pvalue/10-sample_gene_pair_categories.ipynb",
+ "output_path": "nbs/25_pvalue/10-sample_gene_pair_categories.run.ipynb",
+ "parameters": {},
+ "start_time": "2023-09-11T19:45:03.390989",
+ "version": "2.3.4"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/nbs/25_pvalue/py/15-compute_pvalues_from_samples.py b/nbs/25_pvalue/py/15-compute_pvalues_from_samples.py
new file mode 100644
index 00000000..e506215d
--- /dev/null
+++ b/nbs/25_pvalue/py/15-compute_pvalues_from_samples.py
@@ -0,0 +1,176 @@
+# ---
+# jupyter:
+# jupytext:
+# cell_metadata_filter: all,-execution,-papermill,-trusted
+# notebook_metadata_filter: -jupytext.text_representation.jupytext_version
+# text_representation:
+# extension: .py
+# format_name: percent
+# format_version: '1.3'
+# kernelspec:
+# display_name: Python 3 (ipykernel)
+# language: python
+# name: python3
+# ---
+
+# %% [markdown] tags=[]
+# # Description
+
+# %% [markdown] tags=[]
+# TODO
+
+# %% [markdown] tags=[]
+# # Modules loading
+
+# %% tags=[]
+import numpy as np
+import matplotlib.pyplot as plt
+import seaborn as sns
+from scipy import stats
+import numpy as np
+import pandas as pd
+from concurrent.futures import as_completed, ProcessPoolExecutor
+
+from ccc.coef import ccc
+from ccc import conf
+
+# %% [markdown] tags=[]
+# # Settings
+
+# %% tags=[]
+DATASET_CONFIG = conf.GTEX
+GTEX_TISSUE = "whole_blood"
+GENE_SEL_STRATEGY = "var_pc_log2"
+
+PVALUE_N_PERMS = 100000
+
+RANDOM_STATE = np.random.RandomState(0)
+
+# %% [markdown] tags=[]
+# # Paths
+
+# %% tags=[]
+INPUT_GENE_EXPR_FILE = (
+ DATASET_CONFIG["GENE_SELECTION_DIR"]
+ / f"gtex_v8_data_{GTEX_TISSUE}-{GENE_SEL_STRATEGY}.pkl"
+)
+display(INPUT_GENE_EXPR_FILE)
+
+assert INPUT_GENE_EXPR_FILE.exists()
+
+# %% tags=[]
+INPUT_GENE_PAIRS_INTERSECTIONS_FILE = (
+ DATASET_CONFIG["GENE_PAIR_INTERSECTIONS"]
+ / f"gene_pair_intersections-gtex_v8-{GTEX_TISSUE}-{GENE_SEL_STRATEGY}.pkl"
+)
+display(INPUT_GENE_PAIRS_INTERSECTIONS_FILE)
+
+assert INPUT_GENE_PAIRS_INTERSECTIONS_FILE.exists()
+
+# %% tags=[]
+OUTPUT_DIR = DATASET_CONFIG["GENE_PAIR_INTERSECTIONS"] / "pvalues"
+OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
+
+# %% tags=[]
+OUTPUT_DIR
+
+# %% [markdown] tags=[]
+# # Load gene expression data
+
+# %%
+data = pd.read_pickle(INPUT_GENE_EXPR_FILE).sort_index()
+
+# %%
+data.shape
+
+# %%
+data.head()
+
+# %% [markdown] tags=[]
+# # Load gene pairs samples
+
+# %%
+output_file = OUTPUT_DIR / "gene_pair-samples.pkl"
+
+# %%
+gene_pair_samples = pd.read_pickle(output_file)
+
+# %%
+len(gene_pair_samples)
+
+# %%
+sorted(gene_pair_samples.keys())
+
+# %%
+gene_pair_samples["all_high"].head()
+
+# %%
+[i for i in gene_pair_samples["all_high"].head(10).index]
+
+# %% [markdown]
+# # Compute pvalues on sampled gene pairs
+
+# %% tags=[]
+output_file = OUTPUT_DIR / "gene_pair-samples-pvalues.pkl"
+
+
+# %% tags=[]
+def corr_single(x, y):
+ ccc_val, ccc_pval = ccc(x, y, pvalue_n_perms=PVALUE_N_PERMS, n_jobs=1)
+ p_val, p_pval = stats.pearsonr(x, y)
+ s_val, s_pval = stats.spearmanr(x, y)
+
+ return ccc_val, ccc_pval, p_val, p_pval, s_val, s_pval
+
+
+# %%
+results = []
+
+with ProcessPoolExecutor(max_workers=conf.GENERAL["N_JOBS"]) as executor:
+ tasks = {
+ executor.submit(corr_single, data.loc[gene0], data.loc[gene1]): (gene0, gene1, k)
+ for k, v in gene_pair_samples.items()
+ for gene0, gene1 in gene_pair_samples[k].index
+ }
+
+ for t_idx, t in enumerate(as_completed(tasks)):
+ gene0, gene1, k = tasks[t]
+ ccc_val, ccc_pval, p_val, p_pval, s_val, s_pval = t.result()
+
+ results.append({
+ "gene0": gene0,
+ "gene1": gene1,
+ "group": k,
+ "ccc": ccc_val,
+ "ccc_pvalue": ccc_pval,
+ "pearson": p_val,
+ "pearson_pvalue": p_pval,
+ "spearman": s_val,
+ "spearman_pvalue": s_pval,
+ })
+
+ if t_idx % 10:
+ _df = pd.DataFrame(results)
+ _df["group"] = _df["group"].astype("category")
+ _df.to_pickle(output_file)
+
+# %% tags=[]
+len(results)
+
+# %% tags=[]
+results_df = pd.DataFrame(results)
+results_df["group"] = results_df["group"].astype("category")
+
+# %%
+results_df.shape
+
+# %%
+results_df.head()
+
+# %% [markdown] tags=[]
+# # Save
+
+# %% tags=[]
+results_df.to_pickle(output_file)
+
+# %% tags=[]