Skip to content

Commit

Permalink
Improve performance with torch compile and torchrun DDP
Browse files: browse the repository at this point in the history
  • Loading branch information
gonzalobenegas committed Dec 19, 2023
1 parent c6a6db2 commit 05b23c5
Show file tree
Hide file tree
Showing 18 changed files with 996 additions and 234 deletions.
9 changes: 5 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ Can also be called GPN-SS (single sequence).
- Specify config overrides: e.g. `--config_overrides n_layers=30`
- Example:
```bash
WANDB_PROJECT=your_project python -m gpn.ss.run_mlm --do_train --do_eval \
WANDB_PROJECT=your_project torchrun --nproc_per_node=$(echo $CUDA_VISIBLE_DEVICES | awk -F',' '{print NF}') -m gpn.ss.run_mlm --do_train --do_eval \
--fp16 --report_to wandb --prediction_loss_only True --remove_unused_columns False \
--dataset_name results/dataset --tokenizer_name gonzalobenegas/tokenizer-dna-mlm \
--soft_masked_loss_weight_train 0.1 --soft_masked_loss_weight_evaluation 0.0 \
Expand All @@ -54,20 +54,21 @@ WANDB_PROJECT=your_project python -m gpn.ss.run_mlm --do_train --do_eval \
--eval_steps 10000 --logging_steps 10000 --max_steps 120000 --warmup_steps 1000 \
--learning_rate 1e-3 --lr_scheduler_type constant_with_warmup \
--run_name your_run --output_dir your_output_dir --model_type ConvNet \
--per_device_train_batch_size 512 --per_device_eval_batch_size 512 --gradient_accumulation_steps 1
--per_device_train_batch_size 512 --per_device_eval_batch_size 512 --gradient_accumulation_steps 1 \
--torch_compile
```
3. Extract embeddings
- Input file requires `chrom`, `start`, `end`
- Example:
```bash
python -m gpn.ss.get_embeddings windows.parquet genome.fa.gz 100 your_output_dir \
torchrun --nproc_per_node=$(echo $CUDA_VISIBLE_DEVICES | awk -F',' '{print NF}') -m gpn.ss.get_embeddings windows.parquet genome.fa.gz 100 your_output_dir \
results.parquet --per-device-batch-size 4000 --is-file --dataloader-num-workers 16
```
4. Variant effect prediction
- Input file requires `chrom`, `pos`, `ref`, `alt`
- Example:
```bash
python -m gpn.ss.run_vep variants.parquet genome.fa.gz 512 your_output_dir results.parquet \
torchrun --nproc_per_node=$(echo $CUDA_VISIBLE_DEVICES | awk -F',' '{print NF}') -m gpn.ss.run_vep variants.parquet genome.fa.gz 512 your_output_dir results.parquet \
--per-device-batch-size 4000 --is-file --dataloader-num-workers 16
```

Expand Down
11 changes: 6 additions & 5 deletions analysis/human/workflow/Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -56,28 +56,29 @@ hparams = {
"multiz100way/89/128/64/True/defined.phastCons.percentile-75_0.05_0.001",

# changing percentage
"multiz100way/89/128/64/True/defined.phastCons.percentile-75_0.50_0.001",
#"multiz100way/89/128/64/True/defined.phastCons.percentile-75_0.50_0.001",

# using more species
"multiz100way/99/128/64/True/defined.phastCons.percentile-75_0.05_0.001",
#"multiz100way/99/128/64/True/defined.phastCons.percentile-75_0.05_0.001",

# changing window size
"multiz100way/89/64/32/True/defined.phastCons.percentile-75_0.05_0.001",
"multiz100way/89/256/128/True/defined.phastCons.percentile-75_0.05_0.001",
],
"use_aux_features": [
True,
False,
#False,
],
"loss_weight": [
0.1,
],
"weight_conserved": [
True,
False,
#False,
],
"flip_nonconserved": [
True,
False,
#False,
],
}

Expand Down
23 changes: 14 additions & 9 deletions analysis/human/workflow/notebooks/gwas.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -572,7 +572,10 @@
" \"LongSafari/hyenadna-large-1m-seqlen-hf\",\n",
" \"multiz100way/89/128/64/True/defined.phastCons.percentile-75_0.05_0.001/medium/0.1/42/30000/True/True/True\",\n",
"]\n",
"embedding_models = models[-3:] + [\"Enformer\"]\n",
"embedding_models = models[-3:] + [\n",
" \"Enformer\",\n",
" #\"multiz100way/89/256/128/True/defined.phastCons.percentile-75_0.05_0.001/medium/0.1/42/30000/True/True/True\",\n",
"]\n",
"\n",
"\n",
"model_renaming = {\n",
Expand All @@ -583,6 +586,8 @@
" \"phyloP-Zoonomia\": \"phyloP-241-mammals\",\n",
" \"InstaDeepAI/nucleotide-transformer-2.5b-multi-species\": \"NT\",\n",
" \"LongSafari/hyenadna-large-1m-seqlen-hf\": \"HyenaDNA\",\n",
" \n",
" \"multiz100way/89/256/128/True/defined.phastCons.percentile-75_0.05_0.001/medium/0.1/42/30000/True/True/True\": \"GPN-MSA-256\",\n",
"}\n",
"\n",
"V = pd.read_parquet(f\"../../{dataset_path}/test.parquet\")\n",
Expand Down Expand Up @@ -1900,7 +1905,7 @@
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 2560/2560 [00:05<00:00, 459.16it/s]\n"
"100%|██████████| 2560/2560 [00:05<00:00, 435.23it/s]\n"
]
},
{
Expand All @@ -1915,7 +1920,7 @@
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 256/256 [00:00<00:00, 457.41it/s]\n"
"100%|██████████| 256/256 [00:00<00:00, 443.75it/s]\n"
]
},
{
Expand All @@ -1930,7 +1935,7 @@
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 768/768 [00:01<00:00, 481.92it/s]\n"
"100%|██████████| 768/768 [00:01<00:00, 436.82it/s]\n"
]
},
{
Expand All @@ -1945,7 +1950,7 @@
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 5313/5313 [00:11<00:00, 456.02it/s]\n"
"100%|██████████| 5313/5313 [00:12<00:00, 419.97it/s]\n"
]
},
{
Expand All @@ -1962,7 +1967,7 @@
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 2560/2560 [00:05<00:00, 460.34it/s]\n"
"100%|██████████| 2560/2560 [00:05<00:00, 439.83it/s]\n"
]
},
{
Expand All @@ -1977,7 +1982,7 @@
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 256/256 [00:00<00:00, 466.68it/s]\n"
"100%|██████████| 256/256 [00:00<00:00, 437.20it/s]\n"
]
},
{
Expand All @@ -1992,7 +1997,7 @@
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 768/768 [00:01<00:00, 485.14it/s]\n"
"100%|██████████| 768/768 [00:01<00:00, 445.33it/s]\n"
]
},
{
Expand All @@ -2007,7 +2012,7 @@
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 5313/5313 [00:11<00:00, 461.90it/s]\n"
"100%|██████████| 5313/5313 [00:12<00:00, 427.87it/s]\n"
]
},
{
Expand Down
Loading

0 comments on commit 05b23c5

Please sign in to comment.