From 9e596384ff9d4d54c0c981284e8dee8a86a083fb Mon Sep 17 00:00:00 2001 From: Milton Pividori Date: Thu, 4 Jan 2024 00:17:55 -0700 Subject: [PATCH] intro/relationships: add simulated categorical datasets --- .../intro/05-relationships_analysis.ipynb | 1500 ++++++++++++++--- .../intro/py/05-relationships_analysis.py | 360 +++- 2 files changed, 1616 insertions(+), 244 deletions(-) diff --git a/nbs/99_manuscript/intro/05-relationships_analysis.ipynb b/nbs/99_manuscript/intro/05-relationships_analysis.ipynb index c4f8b4c7..0cd57347 100644 --- a/nbs/99_manuscript/intro/05-relationships_analysis.ipynb +++ b/nbs/99_manuscript/intro/05-relationships_analysis.ipynb @@ -5,10 +5,10 @@ "id": "4efdf9ad-f1b0-40a4-a033-4bf93f7ad030", "metadata": { "papermill": { - "duration": 0.007677, - "end_time": "2023-12-04T18:41:43.832846", + "duration": 0.012313, + "end_time": "2024-01-04T08:12:40.644846", "exception": false, - "start_time": "2023-12-04T18:41:43.825169", + "start_time": "2024-01-04T08:12:40.632533", "status": "completed" }, "tags": [] @@ -22,10 +22,10 @@ "id": "c8e1cfb2-bb21-40ab-9fd9-03c3458c0ab4", "metadata": { "papermill": { - "duration": 0.007517, - "end_time": "2023-12-04T18:41:43.850875", + "duration": 0.010089, + "end_time": "2024-01-04T08:12:40.665589", "exception": false, - "start_time": "2023-12-04T18:41:43.843358", + "start_time": "2024-01-04T08:12:40.655500", "status": "completed" }, "tags": [] @@ -39,10 +39,10 @@ "id": "8b7de09b-03ff-445a-b460-9a32fe7c70ad", "metadata": { "papermill": { - "duration": 0.00619, - "end_time": "2023-12-04T18:41:43.863421", + "duration": 0.011739, + "end_time": "2024-01-04T08:12:40.688154", "exception": false, - "start_time": "2023-12-04T18:41:43.857231", + "start_time": "2024-01-04T08:12:40.676415", "status": "completed" }, "tags": [] @@ -57,16 +57,16 @@ "id": "e63984b9-d85c-4ec2-854a-eea10d9f2cad", "metadata": { "execution": { - "iopub.execute_input": "2023-12-04T18:41:43.877292Z", - "iopub.status.busy": 
"2023-12-04T18:41:43.877068Z", - "iopub.status.idle": "2023-12-04T18:41:44.675411Z", - "shell.execute_reply": "2023-12-04T18:41:44.674973Z" + "iopub.execute_input": "2024-01-04T08:12:40.710175Z", + "iopub.status.busy": "2024-01-04T08:12:40.709812Z", + "iopub.status.idle": "2024-01-04T08:12:41.528473Z", + "shell.execute_reply": "2024-01-04T08:12:41.528021Z" }, "papermill": { - "duration": 0.807389, - "end_time": "2023-12-04T18:41:44.677165", + "duration": 0.832168, + "end_time": "2024-01-04T08:12:41.530292", "exception": false, - "start_time": "2023-12-04T18:41:43.869776", + "start_time": "2024-01-04T08:12:40.698124", "status": "completed" }, "tags": [] @@ -90,16 +90,16 @@ "id": "18bb0c93-55f7-433e-8358-d076df5e4124", "metadata": { "execution": { - "iopub.execute_input": "2023-12-04T18:41:44.691458Z", - "iopub.status.busy": "2023-12-04T18:41:44.691351Z", - "iopub.status.idle": "2023-12-04T18:41:44.694470Z", - "shell.execute_reply": "2023-12-04T18:41:44.694095Z" + "iopub.execute_input": "2024-01-04T08:12:41.552054Z", + "iopub.status.busy": "2024-01-04T08:12:41.551934Z", + "iopub.status.idle": "2024-01-04T08:12:41.555324Z", + "shell.execute_reply": "2024-01-04T08:12:41.554931Z" }, "papermill": { - "duration": 0.011612, - "end_time": "2023-12-04T18:41:44.695737", + "duration": 0.016063, + "end_time": "2024-01-04T08:12:41.556668", "exception": false, - "start_time": "2023-12-04T18:41:44.684125", + "start_time": "2024-01-04T08:12:41.540605", "status": "completed" }, "tags": [] @@ -120,10 +120,10 @@ "id": "0619629b-09ba-462b-be5b-ad48ff7e8ffa", "metadata": { "papermill": { - "duration": 0.006635, - "end_time": "2023-12-04T18:41:44.709108", + "duration": 0.013537, + "end_time": "2024-01-04T08:12:41.581843", "exception": false, - "start_time": "2023-12-04T18:41:44.702473", + "start_time": "2024-01-04T08:12:41.568306", "status": "completed" }, "tags": [] @@ -132,15 +132,40 @@ "# Settings" ] }, + { + "cell_type": "code", + "execution_count": 3, + "id": 
"950a4c5b-b29c-4db3-8fdf-605a412fb069", + "metadata": { + "execution": { + "iopub.execute_input": "2024-01-04T08:12:41.604217Z", + "iopub.status.busy": "2024-01-04T08:12:41.604025Z", + "iopub.status.idle": "2024-01-04T08:12:41.606370Z", + "shell.execute_reply": "2024-01-04T08:12:41.605985Z" + }, + "papermill": { + "duration": 0.014616, + "end_time": "2024-01-04T08:12:41.607819", + "exception": false, + "start_time": "2024-01-04T08:12:41.593203", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "PVALUE_N_PERMS = 10000" + ] + }, { "cell_type": "markdown", "id": "a629452d-4419-4114-b7a2-176d99eaad5c", "metadata": { "papermill": { - "duration": 0.006592, - "end_time": "2023-12-04T18:41:44.722355", + "duration": 0.008343, + "end_time": "2024-01-04T08:12:41.626157", "exception": false, - "start_time": "2023-12-04T18:41:44.715763", + "start_time": "2024-01-04T08:12:41.617814", "status": "completed" }, "tags": [] @@ -151,20 +176,20 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 4, "id": "1dbdd918-9637-40a5-89d7-f40a741d681a", "metadata": { "execution": { - "iopub.execute_input": "2023-12-04T18:41:44.736209Z", - "iopub.status.busy": "2023-12-04T18:41:44.736107Z", - "iopub.status.idle": "2023-12-04T18:41:44.738507Z", - "shell.execute_reply": "2023-12-04T18:41:44.738116Z" + "iopub.execute_input": "2024-01-04T08:12:41.637383Z", + "iopub.status.busy": "2024-01-04T08:12:41.637259Z", + "iopub.status.idle": "2024-01-04T08:12:41.640101Z", + "shell.execute_reply": "2024-01-04T08:12:41.639634Z" }, "papermill": { - "duration": 0.010721, - "end_time": "2023-12-04T18:41:44.739755", + "duration": 0.009656, + "end_time": "2024-01-04T08:12:41.641173", "exception": false, - "start_time": "2023-12-04T18:41:44.729034", + "start_time": "2024-01-04T08:12:41.631517", "status": "completed" }, "tags": [] @@ -178,20 +203,20 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 5, "id": 
"0328bab2-98c9-4578-9349-3d281792de1a", "metadata": { "execution": { - "iopub.execute_input": "2023-12-04T18:41:44.753520Z", - "iopub.status.busy": "2023-12-04T18:41:44.753131Z", - "iopub.status.idle": "2023-12-04T18:41:44.759145Z", - "shell.execute_reply": "2023-12-04T18:41:44.758656Z" + "iopub.execute_input": "2024-01-04T08:12:41.654084Z", + "iopub.status.busy": "2024-01-04T08:12:41.653892Z", + "iopub.status.idle": "2024-01-04T08:12:41.660863Z", + "shell.execute_reply": "2024-01-04T08:12:41.660382Z" }, "papermill": { - "duration": 0.014186, - "end_time": "2023-12-04T18:41:44.760411", + "duration": 0.01466, + "end_time": "2024-01-04T08:12:41.661704", "exception": false, - "start_time": "2023-12-04T18:41:44.746225", + "start_time": "2024-01-04T08:12:41.647044", "status": "completed" }, "tags": [] @@ -218,16 +243,16 @@ "id": "8a2c4b73-af23-44c8-8b30-b297ed3530e2", "metadata": { "papermill": { - "duration": 0.006261, - "end_time": "2023-12-04T18:41:44.773278", + "duration": 0.005236, + "end_time": "2024-01-04T08:12:41.672322", "exception": false, - "start_time": "2023-12-04T18:41:44.767017", + "start_time": "2024-01-04T08:12:41.667086", "status": "completed" }, "tags": [] }, "source": [ - "# Generate datasets" + "# Numerical datasets" ] }, { @@ -235,10 +260,10 @@ "id": "c785968f-e38f-4c53-b853-f3d686ad363c", "metadata": { "papermill": { - "duration": 0.004646, - "end_time": "2023-12-04T18:41:44.784368", + "duration": 0.00526, + "end_time": "2024-01-04T08:12:41.682903", "exception": false, - "start_time": "2023-12-04T18:41:44.779722", + "start_time": "2024-01-04T08:12:41.677643", "status": "completed" }, "tags": [] @@ -249,20 +274,20 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 6, "id": "87900265-e80f-44ea-9803-671b3bc692e2", "metadata": { "execution": { - "iopub.execute_input": "2023-12-04T18:41:44.792089Z", - "iopub.status.busy": "2023-12-04T18:41:44.791688Z", - "iopub.status.idle": "2023-12-04T18:41:44.797303Z", - "shell.execute_reply": 
"2023-12-04T18:41:44.796815Z" + "iopub.execute_input": "2024-01-04T08:12:41.714813Z", + "iopub.status.busy": "2024-01-04T08:12:41.714459Z", + "iopub.status.idle": "2024-01-04T08:12:41.720116Z", + "shell.execute_reply": "2024-01-04T08:12:41.719649Z" }, "papermill": { - "duration": 0.010373, - "end_time": "2023-12-04T18:41:44.798126", + "duration": 0.012636, + "end_time": "2024-01-04T08:12:41.721093", "exception": false, - "start_time": "2023-12-04T18:41:44.787753", + "start_time": "2024-01-04T08:12:41.708457", "status": "completed" }, "tags": [] @@ -274,20 +299,20 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 7, "id": "a20cf193-ebd6-4af5-a0ff-fdff19d05146", "metadata": { "execution": { - "iopub.execute_input": "2023-12-04T18:41:44.805992Z", - "iopub.status.busy": "2023-12-04T18:41:44.805615Z", - "iopub.status.idle": "2023-12-04T18:41:44.809484Z", - "shell.execute_reply": "2023-12-04T18:41:44.809023Z" + "iopub.execute_input": "2024-01-04T08:12:41.732661Z", + "iopub.status.busy": "2024-01-04T08:12:41.732306Z", + "iopub.status.idle": "2024-01-04T08:12:41.736564Z", + "shell.execute_reply": "2024-01-04T08:12:41.736058Z" }, "papermill": { - "duration": 0.008677, - "end_time": "2023-12-04T18:41:44.810285", + "duration": 0.011015, + "end_time": "2024-01-04T08:12:41.737447", "exception": false, - "start_time": "2023-12-04T18:41:44.801608", + "start_time": "2024-01-04T08:12:41.726432", "status": "completed" }, "tags": [] @@ -299,7 +324,7 @@ "(44, 3)" ] }, - "execution_count": 6, + "execution_count": 7, "metadata": {}, "output_type": "execute_result" } @@ -310,20 +335,20 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 8, "id": "322bb7eb-24e8-4976-b740-9af3bdf5c9f0", "metadata": { "execution": { - "iopub.execute_input": "2023-12-04T18:41:44.818162Z", - "iopub.status.busy": "2023-12-04T18:41:44.817749Z", - "iopub.status.idle": "2023-12-04T18:41:44.826754Z", - "shell.execute_reply": "2023-12-04T18:41:44.826265Z" + 
"iopub.execute_input": "2024-01-04T08:12:41.749278Z", + "iopub.status.busy": "2024-01-04T08:12:41.748850Z", + "iopub.status.idle": "2024-01-04T08:12:41.758390Z", + "shell.execute_reply": "2024-01-04T08:12:41.757899Z" }, "papermill": { - "duration": 0.013819, - "end_time": "2023-12-04T18:41:44.827562", + "duration": 0.016335, + "end_time": "2024-01-04T08:12:41.759180", "exception": false, - "start_time": "2023-12-04T18:41:44.813743", + "start_time": "2024-01-04T08:12:41.742845", "status": "completed" }, "tags": [] @@ -399,7 +424,7 @@ "4 I 11.0 8.33" ] }, - "execution_count": 7, + "execution_count": 8, "metadata": {}, "output_type": "execute_result" } @@ -410,20 +435,20 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 9, "id": "148acbe5-e354-4c36-98d9-57f66e6742f7", "metadata": { "execution": { - "iopub.execute_input": "2023-12-04T18:41:44.835820Z", - "iopub.status.busy": "2023-12-04T18:41:44.835366Z", - "iopub.status.idle": "2023-12-04T18:41:44.839525Z", - "shell.execute_reply": "2023-12-04T18:41:44.839031Z" + "iopub.execute_input": "2024-01-04T08:12:41.771039Z", + "iopub.status.busy": "2024-01-04T08:12:41.770639Z", + "iopub.status.idle": "2024-01-04T08:12:41.774489Z", + "shell.execute_reply": "2024-01-04T08:12:41.773972Z" }, "papermill": { - "duration": 0.009232, - "end_time": "2023-12-04T18:41:44.840343", + "duration": 0.010637, + "end_time": "2024-01-04T08:12:41.775283", "exception": false, - "start_time": "2023-12-04T18:41:44.831111", + "start_time": "2024-01-04T08:12:41.764646", "status": "completed" }, "tags": [] @@ -437,20 +462,20 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 10, "id": "b32b2f78-5052-448b-beda-38f424d9875f", "metadata": { "execution": { - "iopub.execute_input": "2023-12-04T18:41:44.848851Z", - "iopub.status.busy": "2023-12-04T18:41:44.848514Z", - "iopub.status.idle": "2023-12-04T18:41:44.862392Z", - "shell.execute_reply": "2023-12-04T18:41:44.861883Z" + "iopub.execute_input": 
"2024-01-04T08:12:41.787114Z", + "iopub.status.busy": "2024-01-04T08:12:41.786758Z", + "iopub.status.idle": "2024-01-04T08:12:41.799930Z", + "shell.execute_reply": "2024-01-04T08:12:41.799451Z" }, "papermill": { - "duration": 0.019103, - "end_time": "2023-12-04T18:41:44.863256", + "duration": 0.019993, + "end_time": "2024-01-04T08:12:41.800711", "exception": false, - "start_time": "2023-12-04T18:41:44.844153", + "start_time": "2024-01-04T08:12:41.780718", "status": "completed" }, "tags": [] @@ -538,7 +563,7 @@ "max 19.000000 12.740000" ] }, - "execution_count": 9, + "execution_count": 10, "metadata": {}, "output_type": "execute_result" } @@ -549,20 +574,20 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 11, "id": "95a9db8b-29e9-4c59-9826-c728c7b9180b", "metadata": { "execution": { - "iopub.execute_input": "2023-12-04T18:41:44.871658Z", - "iopub.status.busy": "2023-12-04T18:41:44.871238Z", - "iopub.status.idle": "2023-12-04T18:41:44.874189Z", - "shell.execute_reply": "2023-12-04T18:41:44.873720Z" + "iopub.execute_input": "2024-01-04T08:12:41.812824Z", + "iopub.status.busy": "2024-01-04T08:12:41.812405Z", + "iopub.status.idle": "2024-01-04T08:12:41.815582Z", + "shell.execute_reply": "2024-01-04T08:12:41.815083Z" }, "papermill": { - "duration": 0.007947, - "end_time": "2023-12-04T18:41:44.874996", + "duration": 0.010137, + "end_time": "2024-01-04T08:12:41.816408", "exception": false, - "start_time": "2023-12-04T18:41:44.867049", + "start_time": "2024-01-04T08:12:41.806271", "status": "completed" }, "tags": [] @@ -578,10 +603,10 @@ "id": "df0bea14-821a-4f07-b2a0-e956ae138c09", "metadata": { "papermill": { - "duration": 0.003541, - "end_time": "2023-12-04T18:41:44.882155", + "duration": 0.005471, + "end_time": "2024-01-04T08:12:41.827459", "exception": false, - "start_time": "2023-12-04T18:41:44.878614", + "start_time": "2024-01-04T08:12:41.821988", "status": "completed" }, "tags": [] @@ -592,20 +617,20 @@ }, { "cell_type": "code", - 
"execution_count": 11, + "execution_count": 12, "id": "912943d2-a487-48f5-9b92-83242fa2f8c2", "metadata": { "execution": { - "iopub.execute_input": "2023-12-04T18:41:44.890260Z", - "iopub.status.busy": "2023-12-04T18:41:44.889926Z", - "iopub.status.idle": "2023-12-04T18:41:44.892630Z", - "shell.execute_reply": "2023-12-04T18:41:44.892159Z" + "iopub.execute_input": "2024-01-04T08:12:41.839478Z", + "iopub.status.busy": "2024-01-04T08:12:41.839150Z", + "iopub.status.idle": "2024-01-04T08:12:41.841845Z", + "shell.execute_reply": "2024-01-04T08:12:41.841383Z" }, "papermill": { - "duration": 0.007637, - "end_time": "2023-12-04T18:41:44.893430", + "duration": 0.009633, + "end_time": "2024-01-04T08:12:41.842662", "exception": false, - "start_time": "2023-12-04T18:41:44.885793", + "start_time": "2024-01-04T08:12:41.833029", "status": "completed" }, "tags": [] @@ -617,20 +642,20 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 13, "id": "7b79e345-3dd0-48cf-9dee-2fb45822b074", "metadata": { "execution": { - "iopub.execute_input": "2023-12-04T18:41:44.901548Z", - "iopub.status.busy": "2023-12-04T18:41:44.901223Z", - "iopub.status.idle": "2023-12-04T18:41:44.910192Z", - "shell.execute_reply": "2023-12-04T18:41:44.909665Z" + "iopub.execute_input": "2024-01-04T08:12:41.854966Z", + "iopub.status.busy": "2024-01-04T08:12:41.854476Z", + "iopub.status.idle": "2024-01-04T08:12:41.870879Z", + "shell.execute_reply": "2024-01-04T08:12:41.870191Z" }, "papermill": { - "duration": 0.014111, - "end_time": "2023-12-04T18:41:44.911192", + "duration": 0.0237, + "end_time": "2024-01-04T08:12:41.871927", "exception": false, - "start_time": "2023-12-04T18:41:44.897081", + "start_time": "2024-01-04T08:12:41.848227", "status": "completed" }, "tags": [] @@ -665,10 +690,10 @@ "id": "869da4d1-0816-4ded-940f-8eada4532490", "metadata": { "papermill": { - "duration": 0.003497, - "end_time": "2023-12-04T18:41:44.918558", + "duration": 0.005563, + "end_time": 
"2024-01-04T08:12:41.883484", "exception": false, - "start_time": "2023-12-04T18:41:44.915061", + "start_time": "2024-01-04T08:12:41.877921", "status": "completed" }, "tags": [] @@ -679,20 +704,20 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 14, "id": "c7f7a4aa-d9eb-4917-beb8-59d91947b97f", "metadata": { "execution": { - "iopub.execute_input": "2023-12-04T18:41:44.926919Z", - "iopub.status.busy": "2023-12-04T18:41:44.926545Z", - "iopub.status.idle": "2023-12-04T18:41:44.929416Z", - "shell.execute_reply": "2023-12-04T18:41:44.928940Z" + "iopub.execute_input": "2024-01-04T08:12:41.896018Z", + "iopub.status.busy": "2024-01-04T08:12:41.895723Z", + "iopub.status.idle": "2024-01-04T08:12:41.898787Z", + "shell.execute_reply": "2024-01-04T08:12:41.898330Z" }, "papermill": { - "duration": 0.008054, - "end_time": "2023-12-04T18:41:44.930237", + "duration": 0.010359, + "end_time": "2024-01-04T08:12:41.899569", "exception": false, - "start_time": "2023-12-04T18:41:44.922183", + "start_time": "2024-01-04T08:12:41.889210", "status": "completed" }, "tags": [] @@ -704,20 +729,20 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 15, "id": "d345cc0c-3e75-4fcb-9556-f8369b734449", "metadata": { "execution": { - "iopub.execute_input": "2023-12-04T18:41:44.938520Z", - "iopub.status.busy": "2023-12-04T18:41:44.938191Z", - "iopub.status.idle": "2023-12-04T18:41:44.948149Z", - "shell.execute_reply": "2023-12-04T18:41:44.947609Z" + "iopub.execute_input": "2024-01-04T08:12:41.911643Z", + "iopub.status.busy": "2024-01-04T08:12:41.911286Z", + "iopub.status.idle": "2024-01-04T08:12:41.921892Z", + "shell.execute_reply": "2024-01-04T08:12:41.921465Z" }, "papermill": { - "duration": 0.015181, - "end_time": "2023-12-04T18:41:44.949200", + "duration": 0.017563, + "end_time": "2024-01-04T08:12:41.922686", "exception": false, - "start_time": "2023-12-04T18:41:44.934019", + "start_time": "2024-01-04T08:12:41.905123", "status": "completed" }, 
"tags": [] @@ -760,10 +785,10 @@ "id": "689579f3-f2cb-40d4-b913-378217f94acb", "metadata": { "papermill": { - "duration": 0.003493, - "end_time": "2023-12-04T18:41:44.956712", + "duration": 0.005492, + "end_time": "2024-01-04T08:12:41.933781", "exception": false, - "start_time": "2023-12-04T18:41:44.953219", + "start_time": "2024-01-04T08:12:41.928289", "status": "completed" }, "tags": [] @@ -774,20 +799,20 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 16, "id": "df0553e7-3d19-4fcb-b294-69c5a2292e9c", "metadata": { "execution": { - "iopub.execute_input": "2023-12-04T18:41:44.964899Z", - "iopub.status.busy": "2023-12-04T18:41:44.964532Z", - "iopub.status.idle": "2023-12-04T18:41:44.967322Z", - "shell.execute_reply": "2023-12-04T18:41:44.966839Z" + "iopub.execute_input": "2024-01-04T08:12:41.946007Z", + "iopub.status.busy": "2024-01-04T08:12:41.945692Z", + "iopub.status.idle": "2024-01-04T08:12:41.948252Z", + "shell.execute_reply": "2024-01-04T08:12:41.947852Z" }, "papermill": { - "duration": 0.007795, - "end_time": "2023-12-04T18:41:44.968127", + "duration": 0.009676, + "end_time": "2024-01-04T08:12:41.949051", "exception": false, - "start_time": "2023-12-04T18:41:44.960332", + "start_time": "2024-01-04T08:12:41.939375", "status": "completed" }, "tags": [] @@ -799,20 +824,20 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 17, "id": "545070ae-9722-4ed4-a4e7-48faf33c912d", "metadata": { "execution": { - "iopub.execute_input": "2023-12-04T18:41:44.976314Z", - "iopub.status.busy": "2023-12-04T18:41:44.975930Z", - "iopub.status.idle": "2023-12-04T18:41:44.984832Z", - "shell.execute_reply": "2023-12-04T18:41:44.984351Z" + "iopub.execute_input": "2024-01-04T08:12:41.961113Z", + "iopub.status.busy": "2024-01-04T08:12:41.960725Z", + "iopub.status.idle": "2024-01-04T08:12:41.969932Z", + "shell.execute_reply": "2024-01-04T08:12:41.969517Z" }, "papermill": { - "duration": 0.013807, - "end_time": 
"2023-12-04T18:41:44.985601", + "duration": 0.016083, + "end_time": "2024-01-04T08:12:41.970729", "exception": false, - "start_time": "2023-12-04T18:41:44.971794", + "start_time": "2024-01-04T08:12:41.954646", "status": "completed" }, "tags": [] @@ -852,10 +877,10 @@ "id": "16c45c61-8ad7-4fea-9853-c41f60baf976", "metadata": { "papermill": { - "duration": 0.003491, - "end_time": "2023-12-04T18:41:44.992790", + "duration": 0.005457, + "end_time": "2024-01-04T08:12:41.981814", "exception": false, - "start_time": "2023-12-04T18:41:44.989299", + "start_time": "2024-01-04T08:12:41.976357", "status": "completed" }, "tags": [] @@ -866,20 +891,20 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 18, "id": "d066318e-1f1c-4032-87c8-cf2557a62230", "metadata": { "execution": { - "iopub.execute_input": "2023-12-04T18:41:45.000891Z", - "iopub.status.busy": "2023-12-04T18:41:45.000544Z", - "iopub.status.idle": "2023-12-04T18:41:45.003241Z", - "shell.execute_reply": "2023-12-04T18:41:45.002763Z" + "iopub.execute_input": "2024-01-04T08:12:41.993927Z", + "iopub.status.busy": "2024-01-04T08:12:41.993578Z", + "iopub.status.idle": "2024-01-04T08:12:41.996208Z", + "shell.execute_reply": "2024-01-04T08:12:41.995804Z" }, "papermill": { - "duration": 0.00766, - "end_time": "2023-12-04T18:41:45.004040", + "duration": 0.009648, + "end_time": "2024-01-04T08:12:41.997010", "exception": false, - "start_time": "2023-12-04T18:41:44.996380", + "start_time": "2024-01-04T08:12:41.987362", "status": "completed" }, "tags": [] @@ -891,20 +916,20 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 19, "id": "beb68af6-dd8a-4da3-a532-7477aa0c6fbc", "metadata": { "execution": { - "iopub.execute_input": "2023-12-04T18:41:45.012218Z", - "iopub.status.busy": "2023-12-04T18:41:45.011802Z", - "iopub.status.idle": "2023-12-04T18:41:45.026217Z", - "shell.execute_reply": "2023-12-04T18:41:45.025444Z" + "iopub.execute_input": "2024-01-04T08:12:42.009067Z", + 
"iopub.status.busy": "2024-01-04T08:12:42.008919Z", + "iopub.status.idle": "2024-01-04T08:12:42.015367Z", + "shell.execute_reply": "2024-01-04T08:12:42.014962Z" }, "papermill": { - "duration": 0.019741, - "end_time": "2023-12-04T18:41:45.027430", + "duration": 0.01332, + "end_time": "2024-01-04T08:12:42.016147", "exception": false, - "start_time": "2023-12-04T18:41:45.007689", + "start_time": "2024-01-04T08:12:42.002827", "status": "completed" }, "tags": [] @@ -937,10 +962,10 @@ "id": "c51eaf1c-4bff-40ef-9c76-fcea77ca90c5", "metadata": { "papermill": { - "duration": 0.003683, - "end_time": "2023-12-04T18:41:45.055546", + "duration": 0.005469, + "end_time": "2024-01-04T08:12:42.027184", "exception": false, - "start_time": "2023-12-04T18:41:45.051863", + "start_time": "2024-01-04T08:12:42.021715", "status": "completed" }, "tags": [] @@ -951,20 +976,81 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 20, "id": "9328a89b-dbd7-4faf-adeb-8a9a3a60af9e", "metadata": { "execution": { - "iopub.execute_input": "2023-12-04T18:41:45.063643Z", - "iopub.status.busy": "2023-12-04T18:41:45.063451Z", - "iopub.status.idle": "2023-12-04T18:41:45.068616Z", - "shell.execute_reply": "2023-12-04T18:41:45.068217Z" + "iopub.execute_input": "2024-01-04T08:12:42.039065Z", + "iopub.status.busy": "2024-01-04T08:12:42.038915Z", + "iopub.status.idle": "2024-01-04T08:12:42.046013Z", + "shell.execute_reply": "2024-01-04T08:12:42.045610Z" + }, + "papermill": { + "duration": 0.01402, + "end_time": "2024-01-04T08:12:42.046797", + "exception": false, + "start_time": "2024-01-04T08:12:42.032777", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "datasets = {\n", + " idx: df.drop(columns=\"dataset\") for idx, df in datasets_df.groupby(\"dataset\")\n", + "}" + ] + }, + { + "cell_type": "markdown", + "id": "555a7d7c-2ea1-471a-8ada-77fce5837bf7", + "metadata": { + "papermill": { + "duration": 0.005533, + "end_time": "2024-01-04T08:12:42.057935", + 
"exception": false, + "start_time": "2024-01-04T08:12:42.052402", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "## Create dataset dictionary" + ] + }, + { + "cell_type": "markdown", + "id": "d1589084-f27a-44b6-a064-e22093d2a351", + "metadata": { + "papermill": { + "duration": 0.005452, + "end_time": "2024-01-04T08:12:42.068936", + "exception": false, + "start_time": "2024-01-04T08:12:42.063484", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "Create a dictionary with easier access to datasets" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "e6c54042-e751-4fd7-96db-5274cd7d8eb1", + "metadata": { + "execution": { + "iopub.execute_input": "2024-01-04T08:12:42.081143Z", + "iopub.status.busy": "2024-01-04T08:12:42.080658Z", + "iopub.status.idle": "2024-01-04T08:12:42.087775Z", + "shell.execute_reply": "2024-01-04T08:12:42.087366Z" }, "papermill": { - "duration": 0.010164, - "end_time": "2023-12-04T18:41:45.069340", + "duration": 0.014062, + "end_time": "2024-01-04T08:12:42.088587", "exception": false, - "start_time": "2023-12-04T18:41:45.059176", + "start_time": "2024-01-04T08:12:42.074525", "status": "completed" }, "tags": [] @@ -981,34 +1067,34 @@ "id": "e9110ac9-5968-405f-9070-eca3f9389a54", "metadata": { "papermill": { - "duration": 0.003585, - "end_time": "2023-12-04T18:41:45.076607", + "duration": 0.00545, + "end_time": "2024-01-04T08:12:42.099904", "exception": false, - "start_time": "2023-12-04T18:41:45.073022", + "start_time": "2024-01-04T08:12:42.094454", "status": "completed" }, "tags": [] }, "source": [ - "# Plot" + "## Plot" ] }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 22, "id": "0109f1d9-22ff-42fd-8bf8-ab84f449cbbd", "metadata": { "execution": { - "iopub.execute_input": "2023-12-04T18:41:45.084391Z", - "iopub.status.busy": "2023-12-04T18:41:45.084301Z", - "iopub.status.idle": "2023-12-04T18:41:45.089433Z", - "shell.execute_reply": "2023-12-04T18:41:45.089056Z" + 
"iopub.execute_input": "2024-01-04T08:12:42.111901Z", + "iopub.status.busy": "2024-01-04T08:12:42.111566Z", + "iopub.status.idle": "2024-01-04T08:12:42.119638Z", + "shell.execute_reply": "2024-01-04T08:12:42.119224Z" }, "papermill": { - "duration": 0.009929, - "end_time": "2023-12-04T18:41:45.090158", + "duration": 0.014956, + "end_time": "2024-01-04T08:12:42.120420", "exception": false, - "start_time": "2023-12-04T18:41:45.080229", + "start_time": "2024-01-04T08:12:42.105464", "status": "completed" }, "tags": [] @@ -1055,20 +1141,20 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 23, "id": "8201a1b3-a455-41ac-913f-98b3ba9363d8", "metadata": { "execution": { - "iopub.execute_input": "2023-12-04T18:41:45.098358Z", - "iopub.status.busy": "2023-12-04T18:41:45.098020Z", - "iopub.status.idle": "2023-12-04T18:41:45.101537Z", - "shell.execute_reply": "2023-12-04T18:41:45.101060Z" + "iopub.execute_input": "2024-01-04T08:12:42.132228Z", + "iopub.status.busy": "2024-01-04T08:12:42.132075Z", + "iopub.status.idle": "2024-01-04T08:12:42.135326Z", + "shell.execute_reply": "2024-01-04T08:12:42.134911Z" }, "papermill": { - "duration": 0.008542, - "end_time": "2023-12-04T18:41:45.102363", + "duration": 0.010096, + "end_time": "2024-01-04T08:12:42.136093", "exception": false, - "start_time": "2023-12-04T18:41:45.093821", + "start_time": "2024-01-04T08:12:42.125997", "status": "completed" }, "tags": [] @@ -1089,20 +1175,20 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 24, "id": "bbd15cd4-c9ea-40a7-a08c-c2a15781ce4e", "metadata": { "execution": { - "iopub.execute_input": "2023-12-04T18:41:45.110622Z", - "iopub.status.busy": "2023-12-04T18:41:45.110294Z", - "iopub.status.idle": "2023-12-04T18:41:45.114063Z", - "shell.execute_reply": "2023-12-04T18:41:45.113590Z" + "iopub.execute_input": "2024-01-04T08:12:42.148155Z", + "iopub.status.busy": "2024-01-04T08:12:42.147782Z", + "iopub.status.idle": "2024-01-04T08:12:42.151272Z", + 
"shell.execute_reply": "2024-01-04T08:12:42.150866Z" }, "papermill": { - "duration": 0.008765, - "end_time": "2023-12-04T18:41:45.114869", + "duration": 0.010335, + "end_time": "2024-01-04T08:12:42.152056", "exception": false, - "start_time": "2023-12-04T18:41:45.106104", + "start_time": "2024-01-04T08:12:42.141721", "status": "completed" }, "tags": [] @@ -1120,20 +1206,20 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 25, "id": "55ecad23-cf11-4ed5-a81d-c7faa16ad8b9", "metadata": { "execution": { - "iopub.execute_input": "2023-12-04T18:41:45.123092Z", - "iopub.status.busy": "2023-12-04T18:41:45.122673Z", - "iopub.status.idle": "2023-12-04T18:42:09.576208Z", - "shell.execute_reply": "2023-12-04T18:42:09.575801Z" + "iopub.execute_input": "2024-01-04T08:12:42.164056Z", + "iopub.status.busy": "2024-01-04T08:12:42.163732Z", + "iopub.status.idle": "2024-01-04T08:13:07.098399Z", + "shell.execute_reply": "2024-01-04T08:13:07.097965Z" }, "papermill": { - "duration": 24.459916, - "end_time": "2023-12-04T18:42:09.578416", + "duration": 24.943006, + "end_time": "2024-01-04T08:13:07.100650", "exception": false, - "start_time": "2023-12-04T18:41:45.118500", + "start_time": "2024-01-04T08:12:42.157644", "status": "completed" }, "tags": [] @@ -1141,7 +1227,7 @@ "outputs": [ { "data": { - "image/png": "\n", + "image/png": "\n", "text/plain": [ "
" ] @@ -1169,9 +1255,44 @@ " ],\n", " col_wrap=4,\n", " height=5,\n", + " sharey=False,\n", + " sharex=False,\n", " )\n", " g.map(sns.scatterplot, \"x\", \"y\", s=50, alpha=1)\n", " g.set_titles(row_template=\"{row_name}\", col_template=\"{col_name}\")\n", + " g.set_xticklabels([])\n", + " g.set_yticklabels([])\n", + "\n", + " for ax_idx, ax in enumerate(g.axes):\n", + " ax.set_xlabel(\"$x$\")\n", + "\n", + " if ax_idx == 0:\n", + " ax.text(\n", + " -0.10,\n", + " 1.02,\n", + " \"a)\",\n", + " fontweight=\"semibold\",\n", + " fontsize=20,\n", + " transform=ax.transAxes,\n", + " horizontalalignment=\"left\",\n", + " )\n", + " elif ax_idx == 4:\n", + " ax.text(\n", + " -0.10,\n", + " 1.02,\n", + " \"b)\",\n", + " fontweight=\"semibold\",\n", + " fontsize=20,\n", + " transform=ax.transAxes,\n", + " horizontalalignment=\"left\",\n", + " )\n", + "\n", + " if ax_idx in (0, 4):\n", + " ax.set_ylabel(\"$y$\")\n", + " continue\n", + "\n", + " sns.despine(ax=ax, left=True)\n", + " ax.yaxis.set_tick_params(left=False)\n", "\n", " mono = {\"family\": \"monospace\"}\n", "\n", @@ -1184,8 +1305,9 @@ " rs, rs_p = spearmanr(x, y)\n", "\n", " # ccc\n", - " c, max_parts, parts = ccc(x, y, return_parts=True)\n", - " c, c_p = ccc(x, y, pvalue_n_perms=10000)\n", + " (c, c_p), max_parts, parts = ccc(\n", + " x, y, return_parts=True, pvalue_n_perms=PVALUE_N_PERMS\n", + " )\n", "\n", " x_line_points, y_line_points = get_cm_line_points(x, y, max_parts, parts)\n", " for yp in y_line_points:\n", @@ -1213,8 +1335,10 @@ " linespacing=1.1,\n", " )\n", "\n", + " plt.subplots_adjust(hspace=0.2)\n", + "\n", " plt.savefig(\n", - " OUTPUT_FIGURE_DIR / \"relationships.svg\",\n", + " OUTPUT_FIGURE_DIR / \"numerical_relationships.svg\",\n", " # rasterized=True,\n", " # dpi=300,\n", " bbox_inches=\"tight\",\n", @@ -1227,10 +1351,10 @@ "id": "385c9fdb-823b-4eda-a551-dcdbe62e0943", "metadata": { "papermill": { - "duration": 0.008003, - "end_time": "2023-12-04T18:42:09.594882", + "duration": 0.011159, + 
"end_time": "2024-01-04T08:13:07.123639", "exception": false, - "start_time": "2023-12-04T18:42:09.586879", + "start_time": "2024-01-04T08:13:07.112480", "status": "completed" }, "tags": [] @@ -1246,16 +1370,914 @@ "1. With two internal clusters (Anscombe I, II and III) for each variable pair, CCC seems to capture linear relationships. However, two clusters also capture non-coexistence relationships." ] }, + { + "cell_type": "markdown", + "id": "5026f8e5-dc07-43ff-9596-76093dae779b", + "metadata": { + "papermill": { + "duration": 0.006172, + "end_time": "2024-01-04T08:13:07.136794", + "exception": false, + "start_time": "2024-01-04T08:13:07.130622", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "# Categorical datasets" + ] + }, + { + "cell_type": "markdown", + "id": "2de70ce2-3d93-411c-8b3b-a61fb6b57c1a", + "metadata": { + "papermill": { + "duration": 0.006188, + "end_time": "2024-01-04T08:13:07.149577", + "exception": false, + "start_time": "2024-01-04T08:13:07.143389", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "## Independent Two-Categorical" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "id": "a6811486-7838-4454-9441-97c1929808a3", + "metadata": { + "execution": { + "iopub.execute_input": "2024-01-04T08:13:07.168691Z", + "iopub.status.busy": "2024-01-04T08:13:07.168423Z", + "iopub.status.idle": "2024-01-04T08:13:07.171405Z", + "shell.execute_reply": "2024-01-04T08:13:07.170965Z" + }, + "papermill": { + "duration": 0.017214, + "end_time": "2024-01-04T08:13:07.173257", + "exception": false, + "start_time": "2024-01-04T08:13:07.156043", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "rel_name = \"Two-Categorical I\"" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "id": "68fe500f-22a3-4661-b727-a9372279ce53", + "metadata": { + "execution": { + "iopub.execute_input": "2024-01-04T08:13:07.193657Z", + "iopub.status.busy": "2024-01-04T08:13:07.193111Z", + 
"iopub.status.idle": "2024-01-04T08:13:07.198340Z", + "shell.execute_reply": "2024-01-04T08:13:07.197786Z" + }, + "papermill": { + "duration": 0.016171, + "end_time": "2024-01-04T08:13:07.199124", + "exception": false, + "start_time": "2024-01-04T08:13:07.182953", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "np.random.seed(4)\n", + "\n", + "x = np.random.choice([\"A\", \"B\", \"C\"], 100)\n", + "y = np.random.choice([\"Orange\", \"Blue\"], 100)\n", + "\n", + "cat_datasets_df = pd.DataFrame(\n", + " {\n", + " \"dataset\": rel_name,\n", + " \"x\": x,\n", + " \"y\": y,\n", + " }\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "998b963a-fa66-4d45-a38e-536a39a32286", + "metadata": { + "papermill": { + "duration": 0.006511, + "end_time": "2024-01-04T08:13:07.211942", + "exception": false, + "start_time": "2024-01-04T08:13:07.205431", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "## Two-Categorical" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "id": "5f0b47d9-be00-4d30-beb2-667f5a12e847", + "metadata": { + "execution": { + "iopub.execute_input": "2024-01-04T08:13:07.225488Z", + "iopub.status.busy": "2024-01-04T08:13:07.225237Z", + "iopub.status.idle": "2024-01-04T08:13:07.228078Z", + "shell.execute_reply": "2024-01-04T08:13:07.227597Z" + }, + "papermill": { + "duration": 0.010618, + "end_time": "2024-01-04T08:13:07.228844", + "exception": false, + "start_time": "2024-01-04T08:13:07.218226", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "rel_name = \"Two-Categorical II\"" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "id": "2b28f3fa-883d-45b8-82cb-6c766b2998a1", + "metadata": { + "execution": { + "iopub.execute_input": "2024-01-04T08:13:07.242644Z", + "iopub.status.busy": "2024-01-04T08:13:07.242241Z", + "iopub.status.idle": "2024-01-04T08:13:07.251663Z", + "shell.execute_reply": "2024-01-04T08:13:07.251160Z" + }, + "papermill": { + 
"duration": 0.017316, + "end_time": "2024-01-04T08:13:07.252462", + "exception": false, + "start_time": "2024-01-04T08:13:07.235146", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "np.random.seed(0)\n", + "\n", + "x = np.random.choice([\"A\", \"B\", \"C\"], 100)\n", + "y = np.random.choice([\"Orange\", \"Blue\"], 100)\n", + "\n", + "_df = pd.DataFrame(\n", + " {\n", + " \"dataset\": rel_name,\n", + " \"x\": x,\n", + " \"y\": y,\n", + " }\n", + ")\n", + "\n", + "_df.loc[_df[_df[\"x\"] == \"A\"].sample(frac=0.50).index, \"y\"] = \"Blue\"\n", + "_df.loc[_df[_df[\"x\"] == \"B\"].sample(frac=0.75).index, \"y\"] = \"Orange\"\n", + "\n", + "cat_datasets_df = cat_datasets_df[~cat_datasets_df[\"dataset\"].isin((rel_name,))]\n", + "cat_datasets_df = cat_datasets_df.append(\n", + " _df,\n", + " ignore_index=True,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "555497ee-42ee-45cf-acbc-5f9e2dbe95d0", + "metadata": { + "papermill": { + "duration": 0.007351, + "end_time": "2024-01-04T08:13:07.266416", + "exception": false, + "start_time": "2024-01-04T08:13:07.259065", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "## Independent Two-Categorical-One-Numerical" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "id": "1460089c-4615-48be-819a-e8c9087291fd", + "metadata": { + "execution": { + "iopub.execute_input": "2024-01-04T08:13:07.280190Z", + "iopub.status.busy": "2024-01-04T08:13:07.279799Z", + "iopub.status.idle": "2024-01-04T08:13:07.282674Z", + "shell.execute_reply": "2024-01-04T08:13:07.282209Z" + }, + "papermill": { + "duration": 0.010715, + "end_time": "2024-01-04T08:13:07.283496", + "exception": false, + "start_time": "2024-01-04T08:13:07.272781", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "rel_name = \"Categorical-Numerical I\"" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "id": "d981cfbb-989a-4ea9-9be3-4f26f63f20ce", + "metadata": { + 
"execution": { + "iopub.execute_input": "2024-01-04T08:13:07.297872Z", + "iopub.status.busy": "2024-01-04T08:13:07.297390Z", + "iopub.status.idle": "2024-01-04T08:13:07.304697Z", + "shell.execute_reply": "2024-01-04T08:13:07.304207Z" + }, + "papermill": { + "duration": 0.015514, + "end_time": "2024-01-04T08:13:07.305522", + "exception": false, + "start_time": "2024-01-04T08:13:07.290008", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "np.random.seed(10)\n", + "\n", + "x = np.random.choice([\"A\", \"B\", \"C\"], 100)\n", + "y = np.random.rand(100)\n", + "y = minmax_scale(y, y_lim)\n", + "\n", + "cat_datasets_df = cat_datasets_df[~cat_datasets_df[\"dataset\"].isin((rel_name,))]\n", + "cat_datasets_df = cat_datasets_df.append(\n", + " pd.DataFrame(\n", + " {\n", + " \"dataset\": rel_name,\n", + " \"x\": x,\n", + " \"y\": y,\n", + " }\n", + " ),\n", + " ignore_index=True,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "5a54ea55-fbb0-45e5-877a-411cd0e7db8a", + "metadata": { + "papermill": { + "duration": 0.006371, + "end_time": "2024-01-04T08:13:07.318775", + "exception": false, + "start_time": "2024-01-04T08:13:07.312404", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "## Two-Categorical-One-Numerical" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "id": "f102d770-4c21-40b2-8b5f-e47f93b8eebc", + "metadata": { + "execution": { + "iopub.execute_input": "2024-01-04T08:13:07.332874Z", + "iopub.status.busy": "2024-01-04T08:13:07.332548Z", + "iopub.status.idle": "2024-01-04T08:13:07.335620Z", + "shell.execute_reply": "2024-01-04T08:13:07.335147Z" + }, + "papermill": { + "duration": 0.010963, + "end_time": "2024-01-04T08:13:07.336423", + "exception": false, + "start_time": "2024-01-04T08:13:07.325460", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "rel_name = \"Categorical-Numerical II\"" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "id": 
"5e3ca2d6-d9af-4164-9636-d3116bec8122", + "metadata": { + "execution": { + "iopub.execute_input": "2024-01-04T08:13:07.350196Z", + "iopub.status.busy": "2024-01-04T08:13:07.349865Z", + "iopub.status.idle": "2024-01-04T08:13:07.362836Z", + "shell.execute_reply": "2024-01-04T08:13:07.362261Z" + }, + "papermill": { + "duration": 0.020718, + "end_time": "2024-01-04T08:13:07.363683", + "exception": false, + "start_time": "2024-01-04T08:13:07.342965", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "np.random.seed(0)\n", + "\n", + "x = np.random.choice([\"A\", \"B\", \"C\"], 100)\n", + "y = np.nan\n", + "\n", + "_df = pd.DataFrame(\n", + " {\n", + " \"dataset\": rel_name,\n", + " \"x\": x,\n", + " \"y\": y,\n", + " }\n", + ")\n", + "\n", + "_idx = _df[_df[\"x\"] == \"A\"].index # .sample(frac=0.50).index\n", + "_df.loc[_idx, \"y\"] = np.random.normal(0, 0.50, _idx.shape[0])\n", + "\n", + "_idx = _df[_df[\"x\"] == \"B\"].index # sample(frac=0.75).index\n", + "_df.loc[_idx, \"y\"] = np.random.normal(1, 0.25, _idx.shape[0])\n", + "\n", + "_idx = _df[_df[\"x\"] == \"C\"].index # sample(frac=0.75).index\n", + "_df.loc[_idx, \"y\"] = np.random.normal(1, 0.75, _idx.shape[0])\n", + "\n", + "_df[\"y\"] = minmax_scale(_df[\"y\"], y_lim)\n", + "\n", + "# _idx = _df[_df[\"x\"] == \"B\"].sample(frac=0.75).index\n", + "# _df.loc[_idx, \"y\"] = np.random.normal(0, 1, _idx.shape[0])\n", + "\n", + "cat_datasets_df = cat_datasets_df[~cat_datasets_df[\"dataset\"].isin((rel_name,))]\n", + "cat_datasets_df = cat_datasets_df.append(\n", + " _df,\n", + " ignore_index=True,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "6e772c48-9f23-4986-a1f1-d4dfe7347a34", + "metadata": { + "papermill": { + "duration": 0.006672, + "end_time": "2024-01-04T08:13:07.376930", + "exception": false, + "start_time": "2024-01-04T08:13:07.370258", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "## Create dataset dictionary" + ] + }, + { + "cell_type": 
"markdown", + "id": "e6a136ac-cde2-45ab-8513-9219adf0e25e", + "metadata": { + "papermill": { + "duration": 0.006454, + "end_time": "2024-01-04T08:13:07.389775", + "exception": false, + "start_time": "2024-01-04T08:13:07.383321", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "Create a dictionary with easier access to datasets" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "id": "6f05696d-aca7-4325-9e67-35cc698dabbc", + "metadata": { + "execution": { + "iopub.execute_input": "2024-01-04T08:13:07.403611Z", + "iopub.status.busy": "2024-01-04T08:13:07.403365Z", + "iopub.status.idle": "2024-01-04T08:13:07.409416Z", + "shell.execute_reply": "2024-01-04T08:13:07.408935Z" + }, + "papermill": { + "duration": 0.013946, + "end_time": "2024-01-04T08:13:07.410173", + "exception": false, + "start_time": "2024-01-04T08:13:07.396227", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "categorical_datasets = {\n", + " idx: df.drop(columns=\"dataset\") for idx, df in cat_datasets_df.groupby(\"dataset\")\n", + "}" + ] + }, + { + "cell_type": "markdown", + "id": "0c8e5e42-b157-41b1-9fb3-89c8aede99b2", + "metadata": { + "papermill": { + "duration": 0.006395, + "end_time": "2024-01-04T08:13:07.423046", + "exception": false, + "start_time": "2024-01-04T08:13:07.416651", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "## Plot" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "id": "fb3a70a0-6325-486b-986b-b76c639c1298", + "metadata": { + "execution": { + "iopub.execute_input": "2024-01-04T08:13:07.436962Z", + "iopub.status.busy": "2024-01-04T08:13:07.436586Z", + "iopub.status.idle": "2024-01-04T08:13:07.445238Z", + "shell.execute_reply": "2024-01-04T08:13:07.444759Z" + }, + "papermill": { + "duration": 0.016448, + "end_time": "2024-01-04T08:13:07.446016", + "exception": false, + "start_time": "2024-01-04T08:13:07.429568", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "data": 
{ + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
datasetxy
0Two-Categorical ICOrange
1Two-Categorical ICOrange
2Two-Categorical IBBlue
3Two-Categorical IBBlue
4Two-Categorical IABlue
............
395Categorical-Numerical IIA7.029552
396Categorical-Numerical IIC10.915389
397Categorical-Numerical IIA8.293415
398Categorical-Numerical IIB9.111649
399Categorical-Numerical IIC8.893113
\n", + "

400 rows × 3 columns

\n", + "
" + ], + "text/plain": [ + " dataset x y\n", + "0 Two-Categorical I C Orange\n", + "1 Two-Categorical I C Orange\n", + "2 Two-Categorical I B Blue\n", + "3 Two-Categorical I B Blue\n", + "4 Two-Categorical I A Blue\n", + ".. ... .. ...\n", + "395 Categorical-Numerical II A 7.029552\n", + "396 Categorical-Numerical II C 10.915389\n", + "397 Categorical-Numerical II A 8.293415\n", + "398 Categorical-Numerical II B 9.111649\n", + "399 Categorical-Numerical II C 8.893113\n", + "\n", + "[400 rows x 3 columns]" + ] + }, + "execution_count": 35, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "cat_datasets_df" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "id": "506ff63d-c1f8-43f0-9261-4b491077c64b", + "metadata": { + "execution": { + "iopub.execute_input": "2024-01-04T08:13:07.460801Z", + "iopub.status.busy": "2024-01-04T08:13:07.460307Z", + "iopub.status.idle": "2024-01-04T08:13:07.465973Z", + "shell.execute_reply": "2024-01-04T08:13:07.465504Z" + }, + "papermill": { + "duration": 0.014039, + "end_time": "2024-01-04T08:13:07.466813", + "exception": false, + "start_time": "2024-01-04T08:13:07.452774", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "def get_cm_line_points(y, max_parts, parts):\n", + " \"\"\"\n", + " Similar to previous function definition of same name, but only takes one variable.\n", + " \"\"\"\n", + " y_max_part = parts[1][max_parts[1]]\n", + " y_unique_k = {}\n", + " for k in np.unique(y_max_part):\n", + " data = y[y_max_part == k]\n", + " y_unique_k[k] = data.min(), data.max()\n", + " y_unique_k = sorted(y_unique_k.items(), key=lambda x: x[1][0])\n", + "\n", + " # x_line_points, y_line_points = [], []\n", + " y_line_points = []\n", + "\n", + " for idx in range(len(y_unique_k) - 1):\n", + " k, (k_min, k_max) = y_unique_k[idx]\n", + " nk, (nk_min, nk_max) = y_unique_k[idx + 1]\n", + "\n", + " y_line_points.append((k_max + nk_min) / 2.0)\n", + "\n", + " return y_line_points" + 
] + }, + { + "cell_type": "code", + "execution_count": 37, + "id": "6492c42b-41ed-4cc4-abd7-251642434de0", + "metadata": { + "execution": { + "iopub.execute_input": "2024-01-04T08:13:07.481724Z", + "iopub.status.busy": "2024-01-04T08:13:07.481356Z", + "iopub.status.idle": "2024-01-04T08:13:07.486323Z", + "shell.execute_reply": "2024-01-04T08:13:07.485839Z" + }, + "papermill": { + "duration": 0.013412, + "end_time": "2024-01-04T08:13:07.487095", + "exception": false, + "start_time": "2024-01-04T08:13:07.473683", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "def plot_internal(x, y, **kwargs):\n", + " x = pd.to_numeric(x, errors=\"ignore\")\n", + " y = pd.to_numeric(y, errors=\"ignore\")\n", + "\n", + " x_is_num = pd.api.types.is_numeric_dtype(x.dtype)\n", + " y_is_num = pd.api.types.is_numeric_dtype(y.dtype)\n", + "\n", + " if not x_is_num and not y_is_num:\n", + " sns.countplot(x=x, hue=y, order=[\"A\", \"B\", \"C\"], hue_order=[\"Blue\", \"Orange\"])\n", + " elif not x_is_num and y_is_num:\n", + " sns.swarmplot(x=x, y=y, order=[\"A\", \"B\", \"C\"])" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "id": "8fb2bace-ba2f-4f61-b394-adf4b7cd8d71", + "metadata": { + "execution": { + "iopub.execute_input": "2024-01-04T08:13:07.502083Z", + "iopub.status.busy": "2024-01-04T08:13:07.501672Z", + "iopub.status.idle": "2024-01-04T08:13:10.740964Z", + "shell.execute_reply": "2024-01-04T08:13:10.740544Z" + }, + "papermill": { + "duration": 3.247458, + "end_time": "2024-01-04T08:13:10.741751", + "exception": false, + "start_time": "2024-01-04T08:13:07.494293", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "with sns.plotting_context(\"paper\", font_scale=1.8):\n", + " g = sns.FacetGrid(\n", + " data=cat_datasets_df,\n", + " col=\"dataset\",\n", + " col_order=[\n", + " \"Two-Categorical I\",\n", + " \"Two-Categorical II\",\n", + " \"Categorical-Numerical I\",\n", + " \"Categorical-Numerical II\",\n", + " ],\n", + " col_wrap=4,\n", + " height=5,\n", + " legend_out=False,\n", + " sharex=False,\n", + " sharey=False,\n", + " )\n", + " g.map(plot_internal, \"x\", \"y\", s=1, alpha=1)\n", + " g.set_titles(row_template=\"{row_name}\", col_template=\"{col_name}\")\n", + " g.add_legend()\n", + " # g.set_xticklabels([])\n", + " g.set_yticklabels([])\n", + "\n", + " for ax_idx, ax in enumerate(g.axes):\n", + " if ax_idx in (1, 3):\n", + " sns.despine(ax=ax, left=True)\n", + " ax.yaxis.set_tick_params(left=False)\n", + "\n", + " if ax_idx in (0,):\n", + " ax.set(ylabel=\"$w$\", xlabel=\"$z$\")\n", + "\n", + " ax.text(\n", + " -0.10,\n", + " 1.02,\n", + " \"c)\",\n", + " fontweight=\"semibold\",\n", + " fontsize=20,\n", + " transform=ax.transAxes,\n", + " horizontalalignment=\"left\",\n", + " )\n", + "\n", + " if ax_idx in (1,):\n", + " ax.set(xlabel=\"$z$\")\n", + "\n", + " if ax_idx in (2,):\n", + " ax.set(ylabel=\"$y$\", xlabel=\"$z$\")\n", + "\n", + " ax.text(\n", + " -0.10,\n", + " 1.02,\n", + " \"d)\",\n", + " fontweight=\"semibold\",\n", + " fontsize=20,\n", + " transform=ax.transAxes,\n", + " horizontalalignment=\"left\",\n", + " )\n", + "\n", + " if ax_idx in (3,):\n", + " ax.set(xlabel=\"$z$\")\n", + "\n", + " mono = {\"family\": \"monospace\"}\n", + "\n", + " for ds, ax in g.axes_dict.items():\n", + " df = categorical_datasets[ds]\n", + " x, y = df[\"x\"], df[\"y\"]\n", + " x = pd.to_numeric(x, errors=\"ignore\").to_numpy()\n", + " y = pd.to_numeric(y, errors=\"ignore\").to_numpy()\n", + "\n", + " x_is_num = pd.api.types.is_numeric_dtype(x.dtype)\n", + 
" y_is_num = pd.api.types.is_numeric_dtype(y.dtype)\n", + "\n", + " # ccc\n", + " (c, c_p), max_parts, parts = ccc(\n", + " x, y, return_parts=True, pvalue_n_perms=PVALUE_N_PERMS\n", + " )\n", + "\n", + " if y_is_num:\n", + " y_line_points = get_cm_line_points(y, max_parts, parts)\n", + " for yp in y_line_points:\n", + " ax.hlines(y=yp, xmin=-0.5, xmax=20, color=\"r\", alpha=0.5)\n", + "\n", + " # add text box for the statistics\n", + " stats = f\"$\\it{{c}}$ ={c: .2f}{pvalue_to_star(c_p)}\"\n", + " bbox = dict(boxstyle=\"round\", fc=\"white\", ec=\"black\", alpha=0.75)\n", + " ax.text(\n", + " 0.69,\n", + " 0.07,\n", + " stats,\n", + " fontsize=14,\n", + " fontdict=mono,\n", + " bbox=bbox,\n", + " transform=ax.transAxes,\n", + " horizontalalignment=\"left\",\n", + " linespacing=1.1,\n", + " )\n", + "\n", + " plt.savefig(\n", + " OUTPUT_FIGURE_DIR / \"categorical_relationships.svg\",\n", + " # rasterized=True,\n", + " # dpi=300,\n", + " bbox_inches=\"tight\",\n", + " facecolor=\"white\",\n", + " )" + ] + }, + { + "cell_type": "markdown", + "id": "c4b46cc0-86ac-4801-89e9-fd145ac0bdc1", + "metadata": { + "papermill": { + "duration": 0.011166, + "end_time": "2024-01-04T08:13:10.760357", + "exception": false, + "start_time": "2024-01-04T08:13:10.749191", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "# Create final figure" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "id": "57090b29-80c6-4ae5-8d2e-a1f256f8a58c", + "metadata": { + "execution": { + "iopub.execute_input": "2024-01-04T08:13:10.784321Z", + "iopub.status.busy": "2024-01-04T08:13:10.784158Z", + "iopub.status.idle": "2024-01-04T08:13:10.793460Z", + "shell.execute_reply": "2024-01-04T08:13:10.793072Z" + }, + "papermill": { + "duration": 0.024855, + "end_time": "2024-01-04T08:13:10.794268", + "exception": false, + "start_time": "2024-01-04T08:13:10.769413", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "from svgutils.compose import Figure, SVG, 
Panel" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "id": "e5a39467-e448-43a9-a9ba-9f947bfff1ed", + "metadata": { + "execution": { + "iopub.execute_input": "2024-01-04T08:13:10.809565Z", + "iopub.status.busy": "2024-01-04T08:13:10.809477Z", + "iopub.status.idle": "2024-01-04T08:13:10.817833Z", + "shell.execute_reply": "2024-01-04T08:13:10.817479Z" + }, + "papermill": { + "duration": 0.016932, + "end_time": "2024-01-04T08:13:10.818624", + "exception": false, + "start_time": "2024-01-04T08:13:10.801692", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "Figure(\n", + " \"8.0cm\",\n", + " \"6.5cm\",\n", + " Panel(\n", + " SVG(OUTPUT_FIGURE_DIR / \"numerical_relationships.svg\").scale(0.005),\n", + " ),\n", + " Panel(\n", + " SVG(OUTPUT_FIGURE_DIR / \"categorical_relationships.svg\").scale(0.005),\n", + " ).move(0, 3.45),\n", + ").save(OUTPUT_FIGURE_DIR / f\"relationships.svg\")" + ] + }, + { + "cell_type": "markdown", + "id": "25237272-7e8a-4e3b-8958-d1660ddcb6b0", + "metadata": { + "papermill": { + "duration": 0.007052, + "end_time": "2024-01-04T08:13:10.833316", + "exception": false, + "start_time": "2024-01-04T08:13:10.826264", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "Now open the file, resize to fit drawing to page, and add a white rectangle to the background." 
+ ] + }, { "cell_type": "code", "execution_count": null, "id": "1853d043-2f41-448a-8609-6c3eb8f83b7c", "metadata": { "papermill": { - "duration": 0.008003, - "end_time": "2023-12-04T18:42:09.611043", + "duration": 0.007065, + "end_time": "2024-01-04T08:13:10.847486", "exception": false, - "start_time": "2023-12-04T18:42:09.603040", + "start_time": "2024-01-04T08:13:10.840421", "status": "completed" }, "tags": [] @@ -1288,14 +2310,14 @@ }, "papermill": { "default_parameters": {}, - "duration": 27.127077, - "end_time": "2023-12-04T18:42:10.036390", + "duration": 31.557573, + "end_time": "2024-01-04T08:13:11.273592", "environment_variables": {}, "exception": null, "input_path": "nbs/99_manuscript/intro/05-relationships_analysis.ipynb", "output_path": "nbs/99_manuscript/intro/05-relationships_analysis.run.ipynb", "parameters": {}, - "start_time": "2023-12-04T18:41:42.909313", + "start_time": "2024-01-04T08:12:39.716019", "version": "2.3.3" }, "toc-autonumbering": true diff --git a/nbs/99_manuscript/intro/py/05-relationships_analysis.py b/nbs/99_manuscript/intro/py/05-relationships_analysis.py index 5a5c9086..6e81db52 100644 --- a/nbs/99_manuscript/intro/py/05-relationships_analysis.py +++ b/nbs/99_manuscript/intro/py/05-relationships_analysis.py @@ -45,6 +45,9 @@ # %% [markdown] tags=[] # # Settings +# %% tags=[] +PVALUE_N_PERMS = 10000 + # %% [markdown] tags=[] # # Paths @@ -59,7 +62,7 @@ display(OUTPUT_FIGURE_DIR) # %% [markdown] tags=[] -# # Generate datasets +# # Numerical datasets # %% [markdown] tags=[] # ## Anscombe dataset @@ -220,9 +223,20 @@ idx: df.drop(columns="dataset") for idx, df in datasets_df.groupby("dataset") } +# %% [markdown] tags=[] +# ## Create dataset dictionary # %% [markdown] tags=[] -# # Plot +# Create a dictionary with easier access to datasets + +# %% tags=[] +datasets = { + idx: df.drop(columns="dataset") for idx, df in datasets_df.groupby("dataset") +} + + +# %% [markdown] tags=[] +# ## Plot # %% tags=[] def get_cm_line_points(x, y, 
max_parts, parts): @@ -302,9 +316,44 @@ def pvalue_to_star(pvalue): ], col_wrap=4, height=5, + sharey=False, + sharex=False, ) g.map(sns.scatterplot, "x", "y", s=50, alpha=1) g.set_titles(row_template="{row_name}", col_template="{col_name}") + g.set_xticklabels([]) + g.set_yticklabels([]) + + for ax_idx, ax in enumerate(g.axes): + ax.set_xlabel("$x$") + + if ax_idx == 0: + ax.text( + -0.10, + 1.02, + "a)", + fontweight="semibold", + fontsize=20, + transform=ax.transAxes, + horizontalalignment="left", + ) + elif ax_idx == 4: + ax.text( + -0.10, + 1.02, + "b)", + fontweight="semibold", + fontsize=20, + transform=ax.transAxes, + horizontalalignment="left", + ) + + if ax_idx in (0, 4): + ax.set_ylabel("$y$") + continue + + sns.despine(ax=ax, left=True) + ax.yaxis.set_tick_params(left=False) mono = {"family": "monospace"} @@ -317,8 +366,9 @@ def pvalue_to_star(pvalue): rs, rs_p = spearmanr(x, y) # ccc - c, max_parts, parts = ccc(x, y, return_parts=True) - c, c_p = ccc(x, y, pvalue_n_perms=10000) + (c, c_p), max_parts, parts = ccc( + x, y, return_parts=True, pvalue_n_perms=PVALUE_N_PERMS + ) x_line_points, y_line_points = get_cm_line_points(x, y, max_parts, parts) for yp in y_line_points: @@ -346,8 +396,10 @@ def pvalue_to_star(pvalue): linespacing=1.1, ) + plt.subplots_adjust(hspace=0.2) + plt.savefig( - OUTPUT_FIGURE_DIR / "relationships.svg", + OUTPUT_FIGURE_DIR / "numerical_relationships.svg", # rasterized=True, # dpi=300, bbox_inches="tight", @@ -364,4 +416,302 @@ def pvalue_to_star(pvalue): # 1. When the number of internal clusters (separated by red lines) is higher, CCC is able to capture more complex relationships. # 1. With two internal clusters (Anscombe I, II and III) for each variable pair, CCC seems to capture linear relationships. However, two clusters also capture non-coexistence relationships. 
+# %% [markdown] tags=[] +# # Categorical datasets + +# %% [markdown] tags=[] +# ## Independent Two-Categorical + +# %% tags=[] +rel_name = "Two-Categorical I" + +# %% tags=[] +np.random.seed(4) + +x = np.random.choice(["A", "B", "C"], 100) +y = np.random.choice(["Orange", "Blue"], 100) + +cat_datasets_df = pd.DataFrame( + { + "dataset": rel_name, + "x": x, + "y": y, + } +) + +# %% [markdown] tags=[] +# ## Two-Categorical + +# %% tags=[] +rel_name = "Two-Categorical II" + +# %% tags=[] +np.random.seed(0) + +x = np.random.choice(["A", "B", "C"], 100) +y = np.random.choice(["Orange", "Blue"], 100) + +_df = pd.DataFrame( + { + "dataset": rel_name, + "x": x, + "y": y, + } +) + +_df.loc[_df[_df["x"] == "A"].sample(frac=0.50).index, "y"] = "Blue" +_df.loc[_df[_df["x"] == "B"].sample(frac=0.75).index, "y"] = "Orange" + +cat_datasets_df = cat_datasets_df[~cat_datasets_df["dataset"].isin((rel_name,))] +cat_datasets_df = cat_datasets_df.append( + _df, + ignore_index=True, +) + +# %% [markdown] tags=[] +# ## Independent Two-Categorical-One-Numerical + +# %% tags=[] +rel_name = "Categorical-Numerical I" + +# %% tags=[] +np.random.seed(10) + +x = np.random.choice(["A", "B", "C"], 100) +y = np.random.rand(100) +y = minmax_scale(y, y_lim) + +cat_datasets_df = cat_datasets_df[~cat_datasets_df["dataset"].isin((rel_name,))] +cat_datasets_df = cat_datasets_df.append( + pd.DataFrame( + { + "dataset": rel_name, + "x": x, + "y": y, + } + ), + ignore_index=True, +) + +# %% [markdown] tags=[] +# ## Two-Categorical-One-Numerical + +# %% tags=[] +rel_name = "Categorical-Numerical II" + +# %% tags=[] +np.random.seed(0) + +x = np.random.choice(["A", "B", "C"], 100) +y = np.nan + +_df = pd.DataFrame( + { + "dataset": rel_name, + "x": x, + "y": y, + } +) + +_idx = _df[_df["x"] == "A"].index # .sample(frac=0.50).index +_df.loc[_idx, "y"] = np.random.normal(0, 0.50, _idx.shape[0]) + +_idx = _df[_df["x"] == "B"].index # sample(frac=0.75).index +_df.loc[_idx, "y"] = np.random.normal(1, 0.25, 
_idx.shape[0]) + +_idx = _df[_df["x"] == "C"].index # sample(frac=0.75).index +_df.loc[_idx, "y"] = np.random.normal(1, 0.75, _idx.shape[0]) + +_df["y"] = minmax_scale(_df["y"], y_lim) + +# _idx = _df[_df["x"] == "B"].sample(frac=0.75).index +# _df.loc[_idx, "y"] = np.random.normal(0, 1, _idx.shape[0]) + +cat_datasets_df = cat_datasets_df[~cat_datasets_df["dataset"].isin((rel_name,))] +cat_datasets_df = cat_datasets_df.append( + _df, + ignore_index=True, +) + +# %% [markdown] tags=[] +# ## Create dataset dictionary + +# %% [markdown] tags=[] +# Create a dictionary with easier access to datasets + +# %% tags=[] +categorical_datasets = { + idx: df.drop(columns="dataset") for idx, df in cat_datasets_df.groupby("dataset") +} + +# %% [markdown] tags=[] +# ## Plot + +# %% tags=[] +cat_datasets_df + + +# %% tags=[] +def get_cm_line_points(y, max_parts, parts): + """ + Similar to previous function definition of same name, but only takes one variable. + """ + y_max_part = parts[1][max_parts[1]] + y_unique_k = {} + for k in np.unique(y_max_part): + data = y[y_max_part == k] + y_unique_k[k] = data.min(), data.max() + y_unique_k = sorted(y_unique_k.items(), key=lambda x: x[1][0]) + + # x_line_points, y_line_points = [], [] + y_line_points = [] + + for idx in range(len(y_unique_k) - 1): + k, (k_min, k_max) = y_unique_k[idx] + nk, (nk_min, nk_max) = y_unique_k[idx + 1] + + y_line_points.append((k_max + nk_min) / 2.0) + + return y_line_points + + +# %% tags=[] +def plot_internal(x, y, **kwargs): + x = pd.to_numeric(x, errors="ignore") + y = pd.to_numeric(y, errors="ignore") + + x_is_num = pd.api.types.is_numeric_dtype(x.dtype) + y_is_num = pd.api.types.is_numeric_dtype(y.dtype) + + if not x_is_num and not y_is_num: + sns.countplot(x=x, hue=y, order=["A", "B", "C"], hue_order=["Blue", "Orange"]) + elif not x_is_num and y_is_num: + sns.swarmplot(x=x, y=y, order=["A", "B", "C"]) + + +# %% tags=[] +with sns.plotting_context("paper", font_scale=1.8): + g = sns.FacetGrid( + 
data=cat_datasets_df, + col="dataset", + col_order=[ + "Two-Categorical I", + "Two-Categorical II", + "Categorical-Numerical I", + "Categorical-Numerical II", + ], + col_wrap=4, + height=5, + legend_out=False, + sharex=False, + sharey=False, + ) + g.map(plot_internal, "x", "y", s=1, alpha=1) + g.set_titles(row_template="{row_name}", col_template="{col_name}") + g.add_legend() + # g.set_xticklabels([]) + g.set_yticklabels([]) + + for ax_idx, ax in enumerate(g.axes): + if ax_idx in (1, 3): + sns.despine(ax=ax, left=True) + ax.yaxis.set_tick_params(left=False) + + if ax_idx in (0,): + ax.set(ylabel="$w$", xlabel="$z$") + + ax.text( + -0.10, + 1.02, + "c)", + fontweight="semibold", + fontsize=20, + transform=ax.transAxes, + horizontalalignment="left", + ) + + if ax_idx in (1,): + ax.set(xlabel="$z$") + + if ax_idx in (2,): + ax.set(ylabel="$y$", xlabel="$z$") + + ax.text( + -0.10, + 1.02, + "d)", + fontweight="semibold", + fontsize=20, + transform=ax.transAxes, + horizontalalignment="left", + ) + + if ax_idx in (3,): + ax.set(xlabel="$z$") + + mono = {"family": "monospace"} + + for ds, ax in g.axes_dict.items(): + df = categorical_datasets[ds] + x, y = df["x"], df["y"] + x = pd.to_numeric(x, errors="ignore").to_numpy() + y = pd.to_numeric(y, errors="ignore").to_numpy() + + x_is_num = pd.api.types.is_numeric_dtype(x.dtype) + y_is_num = pd.api.types.is_numeric_dtype(y.dtype) + + # ccc + (c, c_p), max_parts, parts = ccc( + x, y, return_parts=True, pvalue_n_perms=PVALUE_N_PERMS + ) + + if y_is_num: + y_line_points = get_cm_line_points(y, max_parts, parts) + for yp in y_line_points: + ax.hlines(y=yp, xmin=-0.5, xmax=20, color="r", alpha=0.5) + + # add text box for the statistics + stats = f"$\it{{c}}$ ={c: .2f}{pvalue_to_star(c_p)}" + bbox = dict(boxstyle="round", fc="white", ec="black", alpha=0.75) + ax.text( + 0.69, + 0.07, + stats, + fontsize=14, + fontdict=mono, + bbox=bbox, + transform=ax.transAxes, + horizontalalignment="left", + linespacing=1.1, + ) + + plt.savefig( 
+ OUTPUT_FIGURE_DIR / "categorical_relationships.svg", + # rasterized=True, + # dpi=300, + bbox_inches="tight", + facecolor="white", + ) + +# %% [markdown] tags=[] +# # Create final figure + +# %% tags=[] +from svgutils.compose import Figure, SVG, Panel + +# %% tags=[] +Figure( + "8.0cm", + "6.5cm", + Panel( + SVG(OUTPUT_FIGURE_DIR / "numerical_relationships.svg").scale(0.005), + ), + Panel( + SVG(OUTPUT_FIGURE_DIR / "categorical_relationships.svg").scale(0.005), + ).move(0, 3.45), +).save(OUTPUT_FIGURE_DIR / f"relationships.svg") + +# %% [markdown] tags=[] +# Now open the file, resize to fit drawing to page, and add a white rectangle to the background. + # %% tags=[]