albertz committed Nov 6, 2024
1 parent 69423e5 commit 432a729
Showing 1 changed file with 36 additions and 0 deletions.
users/zeyer/experiments/exp2024_04_23_baselines/claix2023.py
@@ -420,6 +420,42 @@ def py():
        env_updates={"PYTORCH_CUDA_ALLOC_CONF": "backend:cudaMallocAsync,expandable_segments:True"},
    )

    # Normalize by num seqs, sum over frames (see the sketch after this call).
    train(
        "lm/trafo-n24-d512-gelu-drop0-b2k_80k-laplace100k-spm10k-lossSeqNorm",
        config=dict_update_deep(
            config_96gb_bf16_accgrad1,
            {
                **_get_cfg_lrlin_oclr_by_bs_nep_v3(80_000, 100, batch_size_factor=1),
                "max_seqs": 2_000,
                "optimizer.weight_decay": 1e-2,
                "calculate_exp_loss": True,
                "use_normalized_loss": "seqs",
            },
        ),
        post_config={"log_grad_norm": True},
        train_dataset=get_librispeech_lm_dataset(
            vocab="spm10k", train_epoch_split=20, train_sort_laplace_num_seqs=100_000
        ),
        model_def=ModelDefWithCfg(
            lm_model_def,
            {
                "_model_def_dict": rf.build_dict(
                    TransformerDecoder,
                    encoder_dim=None,
                    num_layers=24,
                    model_dim=512,
                    ff_activation=rf.build_dict(rf.gelu),
                    dropout=0.0,
                    att_dropout=0.0,
                )
            },
        ),
        train_def=lm_train_def,
        # avoid OOM
        env_updates={"PYTORCH_CUDA_ALLOC_CONF": "expandable_segments:True"},
    )
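For context on the settings above: with use_normalized_loss="seqs", the cross-entropy is summed over frames and normalized by the number of sequences, rather than by the total number of frames as is typical; calculate_exp_loss=True presumably reports exp(loss) as well, i.e. the perplexity. A toy PyTorch sketch of the two normalizations follows; the tensors, lengths, and masking below are made-up illustrations, not the RETURNN implementation:

import torch

# Per-frame CE losses for a toy batch of 2 sequences, padded to length 4.
frame_loss = torch.tensor(
    [[2.0, 1.5, 1.0, 0.5],  # seq 1: 4 real frames
     [2.5, 2.0, 0.0, 0.0]]  # seq 2: 2 real frames, 2 padding frames
)
seq_lens = torch.tensor([4, 2])
mask = torch.arange(frame_loss.shape[1])[None, :] < seq_lens[:, None]

total = (frame_loss * mask).sum()         # sum over all real frames
loss_frame_norm = total / seq_lens.sum()  # usual default: divide by num frames
loss_seq_norm = total / len(seq_lens)     # "seqs": sum over frames, divide by num seqs

Note that with sequence-level normalization the loss scale grows with the average sequence length, which also changes the effective gradient scale; logging the gradient norm (log_grad_norm=True) makes that easy to monitor.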

    from returnn.util.math import PiecewiseLinear

    # Try warmup of batch size (warmupBs).
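Only the comment is visible in this hunk, but the import suggests the batch-size warmup schedule is expressed with RETURNN's PiecewiseLinear helper. A minimal sketch of its interpolation semantics, assuming it maps a dict of (position -> value) points, interpolates linearly between them, and clamps outside the given range; the epochs and batch sizes below are invented for illustration:

from returnn.util.math import PiecewiseLinear

# Hypothetical schedule: ramp the batch size from 20k to 80k over the
# first 5 (sub)epochs, then keep it constant.
batch_size_by_epoch = PiecewiseLinear({1: 20_000, 5: 80_000})

assert batch_size_by_epoch(1) == 20_000
assert batch_size_by_epoch(3) == 50_000    # linear interpolation between points
assert batch_size_by_epoch(100) == 80_000  # clamped beyond the last point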
