Add the updated instructions for downloading data

determined-ai · Apr 29, 2024 · bcc3c78 · bcc3c78
1 parent d1f1e71
commit bcc3c78
Show file tree

Hide file tree

Showing 10 changed files with 263 additions and 57 deletions.
diff --git a/.gitignore b/.gitignore
@@ -2,4 +2,6 @@
 .tool-versions
 .DS_Store
 .direnv/
-
+*.nii.gz
+examples/3d-brain-mri/sample-data/3d-brain.json
+examples/3d-brain-mri/sample-data/dataset-3d-brain.zip
diff --git a/examples/3d-brain-mri/experiment/dist.yaml b/examples/3d-brain-mri/experiment/dist.yaml
@@ -12,7 +12,7 @@ data:
     branch:
     token:
 hyperparameters:
-  global_batch_size: 8
+  global_batch_size: 10
   input_channels: 4
   learning_rate: 1.e-4
   dropout: 0.0
@@ -34,7 +34,7 @@ searcher:
   metric: val_Dice
   smaller_is_better: False
   max_length:
-    epochs: 10
+    epochs: 20
 min_validation_period:
   epochs: 1
 perform_initial_validation: True

diff --git a/examples/3d-brain-mri/experiment/model_code/utils.py b/examples/3d-brain-mri/experiment/model_code/utils.py
@@ -36,12 +36,12 @@ def get_transforms(trial_context):
         PairedToTensor(),
         PairedCrop(height=trial_context.get_hparam("volume_height"),
                    width=trial_context.get_hparam("volume_width"),
-                   depth=trial_context.get_hparam("volume_depth")),,
-        PairedNormalize(trial_context.get_hparam("normalization"))
+                   depth=trial_context.get_hparam("volume_depth")),
+        PairedNormalize(trial_context.get_hparam("normalization")),
         PairedRandomAffine(degrees=(trial_context.get_hparam("affine_degrees_min"), trial_context.get_hparam("affine_degrees_max")),
                            translate=(trial_context.get_hparam("affine_translate_min"), trial_context.get_hparam("affine_translate_max")),
                            scale_ranges=(trial_context.get_hparam("affine_scale_min"), trial_context.get_hparam("affine_scale_max"))),
-        PairedRandomHorizontalFlip(trial_context.get_hparam("hflip_pct")),
+        PairedRandomHorizontalFlip(trial_context.get_hparam("hflip_pct"))
     ])
     eval_transforms = transforms.Compose([
         PairedToTensor(),

diff --git a/examples/3d-brain-mri/readme.md b/examples/3d-brain-mri/readme.md
@@ -4,37 +4,42 @@
 
 # PDK - Pachyderm | Determined | KServe
 ## 3D Brain MRI Example
-**Date/Revision:** January 02, 2024
+**Date/Revision:** April 30, 2024
 
-This example is based on the **Brain MRI Segmentation** Kaggle dataset, which can be found here:<br/>
-https://www.kaggle.com/datasets/mateuszbuda/lgg-mri-segmentation
+This example is based on the **UCSF-PDGM: The University of California San Francisco Preoperative Diffuse Glioma MRI** research dataset, which can be found here:<br/>
+https://www.cancerimagingarchive.net/collection/ucsf-pdgm/
 
-The original dataset contains images from 110 patients that can be used for model training. In the [sample-data](./sample-data/) folder, you will find a small subset of that ([dataset-brain.zip](./sample-data/dataset-brain.zip)), which will be used to train the model. Naturally, you can download the full dataset from Kaggle and use it instead.
+The original dataset contains data from 495 unique subjects. The dataset is formed by taking several MRI scans for each patient, “skull stripping” the scan (leaving just the brain image), and de-identifying the patient. The result is 4 MRI volumes per subject, as well as a target segmentation mask. In the [sample-data](./sample-data/) folder, you will find a small subset of the data from 87 subjects ([dataset-3d-brain.zip](./sample-data/dataset-3d-brain.zip)), which will be used to train the model. Given the size of the subset data being 1.1 GiB, the data is stores using git large file storage (`git lfs`). To download the data, use the following commands from within the repo after cloning it:
+
+```bash
+git lfs install
+git lfs pull
+```
 
 To setup this PDK flow, please follow the instructions in the [Deployment](../../deploy/README.md#setup) page. Since that page is referring to the [Dogs vs Cats](../dog-cat/readme.md) example, you should make the following changes:
 
 Create the following folder structure in the storage bucket (can be skipped for vanilla kubernetes deployments):
 
 ```bash
-brain-mri
-brain-mri/config
-brain-mri/model-store
+pdk-3d-brain-mri
+pdk-3d-brain-mri/config
+pdk-3d-brain-mri/model-store
 ```
 
 &nbsp;
 
-The project name should be `pdk-brain-mri`, and the input repository should be called `brain-mri-data`:
+The project name should be `pdk-3d-brain-mri`, and the input repository should be called `3d-brain-mri-data`:
 
 ```bash
 pachctl connect ${MLDM_URL}
 
 pachctl config set active-context ${MLDM_URL}
 
-pachctl create project pdk-brain-mri
+pachctl create project pdk-3d-brain-mri
 
-pachctl config update context --project pdk-brain-mri
+pachctl config update context --project pdk-3d-brain-mri
 
-pachctl create repo brain-mri-data
+pachctl create repo 3d-brain-mri-data
 
 pachctl list repo
 ```
@@ -43,10 +48,10 @@ pachctl list repo
 
 **MLDE Experiment Project:**
 
-By default, the MLDE experiment will use the `pdk-brain-mri` Project inside the `PDK Demos` workspace. Make sure to create that project before uploading files to the MLDM repo.
+By default, the MLDE experiment will use the `pdk-3d-brain-mri` Project inside the `PDK Demos` workspace. Make sure to create that project before uploading files to the MLDM repo.
 
 ```bash
-det p create "PDK Demos" pdk-brain-mri
+det p create "PDK Demos" pdk-3d-brain-mri
 ```
 
 &nbsp;
@@ -61,26 +66,26 @@ pachctl create pipeline -f deployment-pipeline.json
 
 &nbsp;
 
-To upload files to MLDM, go to the `sample-data` folder, unzip the dataset and use the `put file` command to upload:
+To upload files to MLDM, go to the `sample-data` folder, [download the dataset (`dataset-3d-brain.zip`) and sample json (`3d-brain.json`) payload](https://drive.google.com/drive/folders/1du5eHMRE6VOzUkYRmLdfCdmHRaoBryyy?usp=drive_link), and unzip the dataset and use the `put file` command to upload:
 
 ```bash
-unzip dataset-brain.zip
+unzip dataset-3d-brain.zip -d data
 
-find ./brain/ -name '.DS_Store' -type f -delete
+find ./data/ -name '.DS_Store' -type f -delete
 
-pachctl put file brain-mri-data@master:/data1 -f ./brain -r
+pachctl put file 3d-brain-mri-data@master:/data -f ./data -r
 ```
 
 &nbsp;
 
-Finally, to test the inference service, look for the `brain-mri-deploy` service hostname, and use the `brain.json` file located in the [sample-data](./sample-data/) folder:
+Finally, to test the inference service, look for the `pdk-3d-brain-mri-deploy` service hostname, and use the `3d-brain.json` file located in the [sample-data](./sample-data/) folder:
 
 ```bash
 export INGRESS_HOST=$(kubectl -n istio-system get service istio-ingressgateway -o jsonpath='{.status.loadBalancer.ingress[0].ip}')
 
 export INGRESS_PORT=$(kubectl -n istio-system get service istio-ingressgateway -o jsonpath='{.spec.ports[?(@.name=="http2")].port}')
 
-export SERVICE_HOSTNAME=$(kubectl get inferenceservice brain-mri-deploy -n ${KSERVE_MODELS_NAMESPACE} -o jsonpath='{.status.url}' | cut -d "/" -f 3)
+export SERVICE_HOSTNAME=$(kubectl get inferenceservice pdk-3d-brain-mri-deploy -n ${KSERVE_MODELS_NAMESPACE} -o jsonpath='{.status.url}' | cut -d "/" -f 3)
 
 echo $INGRESS_HOST
 
@@ -92,8 +97,8 @@ echo $SERVICE_HOSTNAME
 curl -v \
 -H "Content-Type: application/json" \
 -H "Host: ${SERVICE_HOSTNAME}" \
-http://${INGRESS_HOST}:${INGRESS_PORT}/v1/models/brain-mri:predict \
--d @./brain.json
+http://${INGRESS_HOST}:${INGRESS_PORT}/v2/models/brain-mri/infer \
+-d @./3d-brain.json
 ```
 
 PS: Depending on your load balancer, you may need to use `.status.loadBalancer.ingress[0].hostname` instead of `.status.loadBalancer.ingress[0].ip` for the `INGRESS_HOST` variable.

diff --git a/...ri/sample-data/Brain-MRI_Prediction.ipynb → ...sample-data/3D-Brain-MRI_Prediction.ipynb b/...ri/sample-data/Brain-MRI_Prediction.ipynb → ...sample-data/3D-Brain-MRI_Prediction.ipynb
@@ -13,10 +13,10 @@
    "id": "ba3033d6-3d69-4911-968a-26f90c2ff764",
    "metadata": {},
    "source": [
-    "<h1>Brain MRI Prediction from KServe InferenceService</h1>\n",
+    "<h1>3D Brain MRI Prediction from KServe InferenceService</h1>\n",
     "\n",
-    "<b>Date: 01/02/24</b><br/>\n",
-    "<b>Version: MLDM 2.8.2 | MLDE 0.26.7 | KServe 11.2</b><br/>\n",
+    "<b>Date: 04/29/24</b><br/>\n",
+    "<b>Version: MLDM 2.8.2 | MLDE 0.27.0 | KServe 11.2</b><br/>\n",
     "<b>Authors: HPE AI At Scale SE Team</b>"
    ]
   },
@@ -47,11 +47,18 @@
     "# Imports\n",
     "import sys\n",
     "import json\n",
+    "import uuid\n",
     "import torch\n",
     "import base64\n",
     "import requests\n",
     "import numpy as np\n",
-    "import matplotlib.pyplot as plt"
+    "import matplotlib.pyplot as plt\n",
+    "\n",
+    "# Import functions for downloading data\n",
+    "from utils.load_data import download_pach_repo, download_data, get_train_transforms\n",
+    "\n",
+    "# Import model util functions\n",
+    "from utils.data_utils import plot_masked_volumes, plot_mask_vs_preds, load_patient_volume\n"
    ]
   },
   {
@@ -78,10 +85,10 @@
    "outputs": [],
    "source": [
     "# Define connection details for KServe InferenceService\n",
-    "model_name = \"brain-mri\"\n",
-    "ingress_host = \"192.168.1.2\"\n",
+    "model_name = \"pdk-3d-brain-mri-deploy\"\n",
+    "ingress_host = \"10.182.1.44\"\n",
     "ingress_port = \"80\"\n",
-    "service_hostname = \"brain-mri-deploy.models.example.com\""
+    "service_hostname = \"pdk-3d-brain-mri-deploy.models.mlds-kserve.us.rdlabs.hpecorp.net\""
    ]
   },
   {
@@ -99,7 +106,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "with open(\"brain.json\", \"r\") as brainfile:\n",
+    "with open(\"3d-brain.json\", \"r\") as brainfile:\n",
     "    brainjson = json.loads(brainfile.read())"
    ]
   },
@@ -131,9 +138,10 @@
     }
    ],
    "source": [
-    "plt.figure(figsize=(5,5))\n",
-    "plt.title(f'Submitted Image: ')\n",
-    "plt.imshow(brainjson['inputs'][0]['data'][0])"
+    "animation_fig = plot_masked_volumes(brainjson['inputs'][0]['data'][0], np.zeros(brainjson['inputs'][0]['data'][0].shape), save=False)\n",
+    "\n",
+    "# Show the animation\n",
+    "animation_fig"
    ]
   },
   {
@@ -166,11 +174,19 @@
     "response = requests.post(url, data=payload, headers=headers)\n",
     "responsej = response.json()\n",
     "\n",
-    "shape = [1, 256, 256]\n",
+    "shape = [144, 224, 224]\n",
     "values = responsej[\"outputs\"][0][\"data\"]\n",
     "output = torch.Tensor(np.array(values).reshape(shape))"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "id": "c20b5ec9",
+   "metadata": {},
+   "source": [
+    "## Send the inference request"
+   ]
+  },
   {
    "cell_type": "markdown",
    "id": "0b7f7082-354f-46e0-ba8f-4a0ece3d6671",
@@ -197,20 +213,12 @@
     }
    ],
    "source": [
-    "tensor_sample = brainjson['inputs'][0]['data'][0]\n",
-    "f, axarr = plt.subplots(1,2, figsize=(10, 10))\n",
-    "axarr[0].imshow(tensor_sample, alpha=0.6)\n",
-    "axarr[0].title.set_text(f'Original Image:')\n",
-    "axarr[1].imshow(output.permute(1, 2, 0), alpha=0.6)\n",
-    "axarr[1].title.set_text(f'Mask (Prediction):')"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "ac921e2e",
-   "metadata": {},
-   "source": [
-    " "
+    "%matplotlib notebook\n",
+    "\n",
+    "animation_fig = plot_mask_vs_preds(brainjson['inputs'][0]['data'][0], output, figsize=(10,4), save=False)\n",
+    "\n",
+    "# Show the animation\n",
+    "animation_fig"
    ]
   },
   {
@@ -236,9 +244,7 @@
     "import base64\n",
     "import requests\n",
     "import numpy as np\n",
-    "import matplotlib.pyplot as plt\n",
-    "\n",
-    "from PIL import Image"
+    "import matplotlib.pyplot as plt"
    ]
   },
   {

diff --git a/examples/3d-brain-mri/sample-data/brain.json b/examples/3d-brain-mri/sample-data/brain.json
diff --git a/examples/3d-brain-mri/sample-data/dataset-brain.zip b/examples/3d-brain-mri/sample-data/dataset-brain.zip
diff --git a/examples/3d-brain-mri/sample-data/utils/__init__.py b/examples/3d-brain-mri/sample-data/utils/__init__.py
diff --git a/examples/3d-brain-mri/sample-data/utils/data_utils.py b/examples/3d-brain-mri/sample-data/utils/data_utils.py
@@ -0,0 +1,108 @@
+import os
+import random
+import numpy as np
+import nibabel as nib
+from pathlib import Path
+import matplotlib.pyplot as plt
+import matplotlib.animation as animation
+from IPython.display import clear_output
+# Render animations as interactive JavaScript/HTML widgets when a
+# FuncAnimation object is displayed in a notebook cell.
+plt.rcParams['animation.html'] = 'jshtml'
+# Raise the embed size limit far beyond any realistic animation so long
+# slice sequences are never truncated (presumably the intent -- the value is
+# simply an arbitrarily huge number).
+plt.rcParams['animation.embed_limit'] = 2**128
+
+
+def load_patient_volume(dir_path, idx=None):
+
+    all_patient_paths = set(nifti_file.parent for nifti_file in Path(dir_path).rglob('*.nii*'))
+    if idx is None or idx < len(all_patient_paths):
+        patient_path = random.choice(list(all_patient_paths))
+    else:
+        patient_path = list(all_patient_paths)[idx]
+    vol_FLAIR = nib.load(next(patient_path.rglob('*FLAIR.nii*'))).get_fdata(dtype=np.float32).T
+    vol_T1c = nib.load(next(patient_path.rglob('*T1c.nii*'))).get_fdata(dtype=np.float32).T
+    vol_T2 = nib.load(next(patient_path.rglob('*T2.nii*'))).get_fdata(dtype=np.float32).T
+    vol_SWI = nib.load(next(patient_path.rglob('*SWI.nii*'))).get_fdata(dtype=np.float32).T
+    vol_mask = nib.load(next(patient_path.rglob('*tumor*.nii*'))).get_fdata(dtype=np.float32).T
+
+    multimodal_vol = np.stack([vol_FLAIR,vol_T1c,vol_T2,vol_SWI])
+
+    return multimodal_vol, vol_mask
+
+
+def plot_masked_volumes(mri_vol, mri_mask, figsize=(20,4), save=False, norm=0.4):
+
+    modalities, (min_slice,mid_slice,max_slice) = preprocess_volumes(mri_vol,mri_mask)
+    norm_mri_mask = (mri_mask/mri_mask.max())*norm
+    fig, ax = plt.subplots(1,len(modalities), figsize=figsize)
+
+    ims = []
+    for n, (vol,name) in enumerate(modalities):
+        # Set the initial image
+        ims += [ax[n].imshow(vol[mid_slice,...] + norm_mri_mask[mid_slice,...], aspect='auto', animated=True)]
+        ax[n].set_title(name)
+        ax[n].set_axis_off()
+
+    def update(i):
+        for n, (vol,_) in enumerate(modalities):
+            ims[n].set_data(vol[min_slice:max_slice,...][i] + norm_mri_mask[min_slice:max_slice,...][i])
+        return ims
+
+    clear_output()
+
+    # Create the animation object
+    animation_fig = animation.FuncAnimation(fig, update, frames=max_slice-min_slice, interval=100, blit=True, repeat_delay=10)
+
+    # Show the animation
+    animation_fig
+    if save:
+        animation_fig.save('./img/all_mri_mask.gif', writer='pillow')
+
+    # Show the animation
+    return animation_fig
+
+def plot_mask_vs_preds(mri_preds, mri_mask, figsize=(10,4), save=False):
+
+    multimodal_mri = np.stack([mri_preds,mri_preds,mri_preds,mri_preds])
+    modalities, (min_slice,mid_slice,max_slice) = preprocess_volumes(multimodal_mri,mri_mask)
+    fig, ax = plt.subplots(1,2,figsize=figsize)
+
+    vol,_ = modalities[0]
+    ims = []
+    # Set the initial image
+    ims = [ax[0].imshow(vol[mid_slice,...], aspect='auto', animated=True),
+           ax[1].imshow(mri_mask[mid_slice,...], aspect='auto', animated=True)]
+    ax[0].set_title(f'Predicted')
+    ax[1].set_title(f'Mask')
+    ax[0].set_axis_off()
+    ax[1].set_axis_off()
+
+    def update(i):
+        ims[0].set_data(vol[min_slice:max_slice,...][i])
+        ims[1].set_data(mri_mask[min_slice:max_slice,...][i])
+        return ims
+
+    clear_output()
+
+    # Create the animation object
+    animation_fig = animation.FuncAnimation(fig, update, frames=max_slice-min_slice, interval=100, blit=True, repeat_delay=10)
+
+    if save:
+        animation_fig.save('./img/mri_preds_vs_mask.gif', writer='pillow')
+
+    # Show the animation
+    return animation_fig
+
+
+def preprocess_volumes(mri_vol, mri_mask):
+
+    norm_mri_vol = mri_vol/mri_vol.max(axis=(1,2,3)).reshape(4,1,1,1)
+    modalities = [(norm_mri_vol[0,...], 'FLAIR'), (norm_mri_vol[1,...], 'T1c'), (norm_mri_vol[2,...], 'T2'), (norm_mri_vol[3,...], 'SWI')]
+
+    min_mask_slice = np.argwhere(mri_mask)[:,0].min()
+    max_mask_slice = np.argwhere(mri_mask)[:,0].max()
+    min_vol_slice = np.argwhere(mri_vol[0,...])[:,0].min()
+    max_vol_slice = np.argwhere(mri_vol[0,...])[:,0].max()
+    min_slice = min_vol_slice + abs(min_mask_slice - min_vol_slice)//2
+    max_slice = max_vol_slice - abs(max_vol_slice - max_mask_slice)//2
+    mid_slice = min_slice + (max_slice - min_slice)//2
+
+    return modalities, (min_slice,mid_slice,max_slice)