Merge pull request #23 from calico/revision-upd-3

Revision update to some scripts and example notebooks.
calico · Jul 12, 2024 · eeee3fe · eeee3fe
2 parents 0c28e87 + 37e9d5c
commit eeee3fe
Show file tree

Hide file tree

Showing 10 changed files with 111 additions and 98 deletions.
diff --git a/README.md b/README.md
@@ -38,15 +38,15 @@ These repositories further depend on a number of python packages (which are auto
 A new conda environment can be created with `conda create -n borzoi_py39 python=3.9`.
 
 ### Model Availability
-The model weights can be downloaded as .h5 files from the following URLs:
+The model weights can be downloaded as .h5 files from the URLs below. We trained a total of 4 model replicates, all using the same training, validation and test split (test = fold3, validation = fold4 from [sequences_human.bed.gz](https://github.com/calico/borzoi/blob/main/data/sequences_human.bed.gz)).
 
-[Borzoi V2 Cross-fold 0](https://storage.googleapis.com/seqnn-share/borzoi/f0/model0_best.h5)<br/>
-[Borzoi V2 Cross-fold 1](https://storage.googleapis.com/seqnn-share/borzoi/f1/model0_best.h5)<br/>
-[Borzoi V2 Cross-fold 2](https://storage.googleapis.com/seqnn-share/borzoi/f2/model0_best.h5)<br/>
-[Borzoi V2 Cross-fold 3](https://storage.googleapis.com/seqnn-share/borzoi/f3/model0_best.h5)<br/>
+[Borzoi V2 Replicate 0](https://storage.googleapis.com/seqnn-share/borzoi/f0/model0_best.h5)<br/>
+[Borzoi V2 Replicate 1](https://storage.googleapis.com/seqnn-share/borzoi/f1/model0_best.h5)<br/>
+[Borzoi V2 Replicate 2](https://storage.googleapis.com/seqnn-share/borzoi/f2/model0_best.h5)<br/>
+[Borzoi V2 Replicate 3](https://storage.googleapis.com/seqnn-share/borzoi/f3/model0_best.h5)<br/>
 
 #### Mini Borzoi Models
-We have trained a collection of (smaller) model instances on various subsets of data modalities (or on all data modalities but with architectural changes compared to the original architecture). For example, some models are trained only on RNA-seq data while others are trained on DNase-, ATAC- and RNA-seq. Similarly, some model instances are trained on human-only data while others are trained on human- and mouse data. The models are available at the URL below:
+We have trained a collection of (smaller) model instances on various subsets of data modalities (or on all data modalities but with architectural changes compared to the original architecture). For example, some models are trained only on RNA-seq data while others are trained on DNase-, ATAC- and RNA-seq. Similarly, some model instances are trained on human-only data while others are trained on human and mouse data. The models were trained with either 2- or 4-fold cross-validation and are available at the following URL:
 
 [Mini Borzoi Model Collection](https://storage.googleapis.com/seqnn-share/borzoi/mini/)<br/>
 

diff --git a/examples/borzoi_example_eqtl_chr10_116952944_T_C.ipynb b/examples/borzoi_example_eqtl_chr10_116952944_T_C.ipynb
@@ -73,13 +73,13 @@
     "%%bash\n",
     "\n",
     "#Download model weights\n",
-    "for fold in f0 f1 f2 f3; do\n",
-    "  mkdir -p \"saved_models/$fold/\"\n",
-    "  local_model=\"saved_models/$fold/model0_best.h5\"\n",
+    "for rep in f0 f1 f2 f3; do\n",
+    "  mkdir -p \"saved_models/$rep/\"\n",
+    "  local_model=\"saved_models/$rep/model0_best.h5\"\n",
     "  if [ -f \"$local_model\" ]; then\n",
-    "    echo \"$fold model already exists.\"\n",
+    "    echo \"$rep model already exists.\"\n",
     "  else\n",
-    "    wget --progress=bar:force \"https://storage.googleapis.com/seqnn-share/borzoi/$fold/model0_best.h5\" -O \"$local_model\"\n",
+    "    wget --progress=bar:force \"https://storage.googleapis.com/seqnn-share/borzoi/$rep/model0_best.h5\" -O \"$local_model\"\n",
     "  fi\n",
     "done\n",
     "\n",
@@ -150,7 +150,7 @@
     "targets_file = 'targets_gtex.txt' #Subset of targets_human.txt\n",
     "\n",
     "seq_len = 524288\n",
-    "n_folds = 1       #To use only one model fold, set to 'n_folds = 1'. To use all four folds, set 'n_folds = 4'.\n",
+    "n_reps = 1       #To use only one model replicate, set to 'n_reps = 1'. To use all four replicates, set 'n_reps = 4'.\n",
     "rc = True         #Average across reverse-complement prediction\n",
     "\n",
     "#Read model parameters\n",
@@ -179,16 +179,16 @@
     "#Initialize model ensemble\n",
     "\n",
     "models = []\n",
-    "for fold_ix in range(n_folds) :\n",
+    "for rep_ix in range(n_reps) :\n",
     "    \n",
-    "    model_file = \"saved_models/f\" + str(fold_ix) + \"/model0_best.h5\"\n",
+    "    model_file = \"saved_models/f\" + str(rep_ix) + \"/model0_best.h5\"\n",
     "\n",
     "    seqnn_model = seqnn.SeqNN(params_model)\n",
     "    seqnn_model.restore(model_file, 0)\n",
     "    seqnn_model.build_slice(target_index)\n",
     "    if rc :\n",
     "        seqnn_model.strand_pair.append(slice_pair)\n",
-    "    seqnn_model.build_ensemble(rc, '0')\n",
+    "    seqnn_model.build_ensemble(rc, [0])\n",
     "    \n",
     "    models.append(seqnn_model)\n"
    ]
@@ -370,7 +370,7 @@
    "source": [
     "%%time\n",
     "#Predict for chr10_116952944_T_C\n",
-    "# (~6 minutes on CPU w 1 fold; ~2 minutes on GPU)\n",
+    "# (~6 minutes on CPU w 1 replicate; ~2 minutes on GPU)\n",
     "\n",
     "save_figs = False\n",
     "save_suffix = '_chr10_116952944_T_C'\n",
@@ -492,7 +492,7 @@
     "    use_ratio=False,\n",
     "    use_logodds=False,\n",
     "    subtract_avg=True,\n",
-    "    fold_index=np.arange(n_folds).tolist(),\n",
+    "    fold_index=np.arange(n_reps).tolist(),\n",
     ")\n"
    ]
   },
@@ -546,7 +546,7 @@
     "    plot_start=(poses[0] - start) - 64,\n",
     "    plot_end=(poses[0] - start) + 64,\n",
     "    save_figs=False,\n",
-    "    fig_name=chrom + '_' + str(poses[0]) + '_prediction_grad_gtex_snp_4_folds_gtex_blood_cov_undo_clip'\n",
+    "    fig_name=chrom + '_' + str(poses[0]) + '_prediction_grad_gtex_snp_4_reps_gtex_blood_cov_undo_clip'\n",
     ")\n"
    ]
   },
@@ -642,7 +642,7 @@
     "    plot_start=(poses[0] - start) - 64,\n",
     "    plot_end=(poses[0] - start) + 64,\n",
     "    save_figs=False,\n",
-    "    fig_name=chrom + '_' + str(poses[0]) + '_prediction_ism_gtex_snp_4_folds_gtex_blood_cov_undo_clip'\n",
+    "    fig_name=chrom + '_' + str(poses[0]) + '_prediction_ism_gtex_snp_4_reps_gtex_blood_cov_undo_clip'\n",
     ")\n"
    ]
   },
@@ -740,7 +740,7 @@
     "    plot_start=(poses[0] - start) - 64,\n",
     "    plot_end=(poses[0] - start) + 64,\n",
     "    save_figs=False,\n",
-    "    fig_name=chrom + '_' + str(poses[0]) + '_prediction_ism_shuffle_gtex_snp_4_folds_gtex_blood_cov_undo_clip'\n",
+    "    fig_name=chrom + '_' + str(poses[0]) + '_prediction_ism_shuffle_gtex_snp_4_reps_gtex_blood_cov_undo_clip'\n",
     ")\n"
    ]
   },
@@ -886,9 +886,9 @@
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "Python [conda env:borzoi_py39_2]",
+   "display_name": "Python 3 (ipykernel)",
    "language": "python",
-   "name": "conda-env-borzoi_py39_2-py"
+   "name": "python3"
   },
   "language_info": {
    "codemirror_mode": {
@@ -900,7 +900,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.9.17"
+   "version": "3.9.13"
   }
  },
  "nbformat": 4,

diff --git a/examples/borzoi_example_ipaqtl_chr10_116664061_G_A.ipynb b/examples/borzoi_example_ipaqtl_chr10_116664061_G_A.ipynb
@@ -73,13 +73,13 @@
     "%%bash\n",
     "\n",
     "#Download model weights\n",
-    "for fold in f0 f1 f2 f3; do\n",
-    "  mkdir -p \"saved_models/$fold/\"\n",
-    "  local_model=\"saved_models/$fold/model0_best.h5\"\n",
+    "for rep in f0 f1 f2 f3; do\n",
+    "  mkdir -p \"saved_models/$rep/\"\n",
+    "  local_model=\"saved_models/$rep/model0_best.h5\"\n",
     "  if [ -f \"$local_model\" ]; then\n",
-    "    echo \"$fold model already exists.\"\n",
+    "    echo \"$rep model already exists.\"\n",
     "  else\n",
-    "    wget --progress=bar:force \"https://storage.googleapis.com/seqnn-share/borzoi/$fold/model0_best.h5\" -O \"$local_model\"\n",
+    "    wget --progress=bar:force \"https://storage.googleapis.com/seqnn-share/borzoi/$rep/model0_best.h5\" -O \"$local_model\"\n",
     "  fi\n",
     "done\n",
     "\n",
@@ -150,7 +150,7 @@
     "targets_file = 'targets_gtex.txt' #Subset of targets_human.txt\n",
     "\n",
     "seq_len = 524288\n",
-    "n_folds = 1       #To use only one model fold, set to 'n_folds = 1'. To use all four folds, set 'n_folds = 4'.\n",
+    "n_reps = 1       #To use only one model replicate, set to 'n_reps = 1'. To use all four replicates, set 'n_reps = 4'.\n",
     "rc = True         #Average across reverse-complement prediction\n",
     "\n",
     "#Read model parameters\n",
@@ -179,16 +179,16 @@
     "#Initialize model ensemble\n",
     "\n",
     "models = []\n",
-    "for fold_ix in range(n_folds) :\n",
+    "for rep_ix in range(n_reps) :\n",
     "    \n",
-    "    model_file = \"saved_models/f\" + str(fold_ix) + \"/model0_best.h5\"\n",
+    "    model_file = \"saved_models/f\" + str(rep_ix) + \"/model0_best.h5\"\n",
     "\n",
     "    seqnn_model = seqnn.SeqNN(params_model)\n",
     "    seqnn_model.restore(model_file, 0)\n",
     "    seqnn_model.build_slice(target_index)\n",
     "    if rc :\n",
     "        seqnn_model.strand_pair.append(slice_pair)\n",
-    "    seqnn_model.build_ensemble(rc, '0')\n",
+    "    seqnn_model.build_ensemble(rc, [0])\n",
     "    \n",
     "    models.append(seqnn_model)\n"
    ]
@@ -329,7 +329,7 @@
    "source": [
     "%%time\n",
     "#Predict for chr10_116664061_G_A\n",
-    "# (~6 minutes on CPU w 1 fold; ~15 seconds on GPU)\n",
+    "# (~6 minutes on CPU w 1 replicate; ~15 seconds on GPU)\n",
     "\n",
     "save_figs = False\n",
     "save_suffix = '_chr10_116664061_G_A'\n",
@@ -518,7 +518,7 @@
     "    use_ratio=True,\n",
     "    use_logodds=False,\n",
     "    subtract_avg=True,\n",
-    "    fold_index=np.arange(n_folds).tolist(),\n",
+    "    fold_index=np.arange(n_reps).tolist(),\n",
     ")\n"
    ]
   },
@@ -572,7 +572,7 @@
     "    plot_start=seq_len - (poses[0] - start) - 64 - 1,\n",
     "    plot_end=seq_len - (poses[0] - start) + 64 - 1,\n",
     "    save_figs=False,\n",
-    "    fig_name=chrom + '_' + str(poses[0]) + '_prediction_grad_gtex_snp_4_folds_gtex_cov_nerve_undo_clip'\n",
+    "    fig_name=chrom + '_' + str(poses[0]) + '_prediction_grad_gtex_snp_4_reps_gtex_cov_nerve_undo_clip'\n",
     ")\n"
    ]
   },
@@ -667,7 +667,7 @@
     "    plot_start=seq_len - (poses[0] - start) - 64 - 1,\n",
     "    plot_end=seq_len - (poses[0] - start) + 64 - 1,\n",
     "    save_figs=False,\n",
-    "    fig_name=chrom + '_' + str(poses[0]) + '_prediction_ism_gtex_snp_4_folds_gtex_cov_nerve_undo_clip'\n",
+    "    fig_name=chrom + '_' + str(poses[0]) + '_prediction_ism_gtex_snp_4_reps_gtex_cov_nerve_undo_clip'\n",
     ")\n"
    ]
   },
@@ -764,7 +764,7 @@
     "    plot_start=seq_len - (poses[0] - start) - 64 - 1,\n",
     "    plot_end=seq_len - (poses[0] - start) + 64 - 1,\n",
     "    save_figs=False,\n",
-    "    fig_name=chrom + '_' + str(poses[0]) + '_prediction_ism_shuffle_gtex_snp_4_folds_gtex_cov_nerve_undo_clip'\n",
+    "    fig_name=chrom + '_' + str(poses[0]) + '_prediction_ism_shuffle_gtex_snp_4_reps_gtex_cov_nerve_undo_clip'\n",
     ")\n"
    ]
   },
@@ -825,7 +825,7 @@
     "    use_ratio=True,\n",
     "    use_logodds=False,\n",
     "    subtract_avg=True,\n",
-    "    fold_index=np.arange(n_folds).tolist(),\n",
+    "    fold_index=np.arange(n_reps).tolist(),\n",
     ")\n"
    ]
   },
@@ -879,7 +879,7 @@
     "    plot_start=seq_len - (pas_pos - start) - 64 - 1,\n",
     "    plot_end=seq_len - (pas_pos - start) + 64 - 1,\n",
     "    save_figs=False,\n",
-    "    fig_name=chrom + '_' + str(poses[0]) + '_prediction_grad_gtex_snp_4_folds_gtex_pas_cov_nerve_undo_clip'\n",
+    "    fig_name=chrom + '_' + str(poses[0]) + '_prediction_grad_gtex_snp_4_reps_gtex_pas_cov_nerve_undo_clip'\n",
     ")\n"
    ]
   },
@@ -974,7 +974,7 @@
     "    plot_start=seq_len - (pas_pos - start) - 64 - 1,\n",
     "    plot_end=seq_len - (pas_pos - start) + 64 - 1,\n",
     "    save_figs=False,\n",
-    "    fig_name=chrom + '_' + str(poses[0]) + '_prediction_ism_gtex_snp_4_folds_gtex_pas_cov_nerve_undo_clip'\n",
+    "    fig_name=chrom + '_' + str(poses[0]) + '_prediction_ism_gtex_snp_4_reps_gtex_pas_cov_nerve_undo_clip'\n",
     ")\n"
    ]
   },
@@ -1073,7 +1073,7 @@
     "    plot_start=seq_len - (pas_pos - start) - 64 - 1,\n",
     "    plot_end=seq_len - (pas_pos - start) + 64 - 1,\n",
     "    save_figs=False,\n",
-    "    fig_name=chrom + '_' + str(poses[0]) + '_prediction_ism_shuffle_gtex_snp_4_folds_gtex_pas_cov_nerve_undo_clip'\n",
+    "    fig_name=chrom + '_' + str(poses[0]) + '_prediction_ism_shuffle_gtex_snp_4_reps_gtex_pas_cov_nerve_undo_clip'\n",
     ")\n"
    ]
   },
@@ -1216,9 +1216,9 @@
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "Python [conda env:borzoi_py39_2]",
+   "display_name": "Python 3 (ipykernel)",
    "language": "python",
-   "name": "conda-env-borzoi_py39_2-py"
+   "name": "python3"
   },
   "language_info": {
    "codemirror_mode": {
@@ -1230,7 +1230,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.9.17"
+   "version": "3.9.13"
   }
  },
  "nbformat": 4,