diff --git a/users/zeyer/experiments/exp2024_04_23_baselines/claix2023.py b/users/zeyer/experiments/exp2024_04_23_baselines/claix2023.py
index 3eac671ae..d9c58af51 100644
--- a/users/zeyer/experiments/exp2024_04_23_baselines/claix2023.py
+++ b/users/zeyer/experiments/exp2024_04_23_baselines/claix2023.py
@@ -420,6 +420,42 @@ def py():
         env_updates={"PYTORCH_CUDA_ALLOC_CONF": "backend:cudaMallocAsync,expandable_segments:True"},
     )
 
+    # Normalize by num seqs, sum over frames.
+    train(
+        "lm/trafo-n24-d512-gelu-drop0-b2k_80k-laplace100k-spm10k-lossSeqNorm",
+        config=dict_update_deep(
+            config_96gb_bf16_accgrad1,
+            {
+                **_get_cfg_lrlin_oclr_by_bs_nep_v3(80_000, 100, batch_size_factor=1),
+                "max_seqs": 2_000,
+                "optimizer.weight_decay": 1e-2,
+                "calculate_exp_loss": True,
+                "use_normalized_loss": "seqs",
+            },
+        ),
+        post_config={"log_grad_norm": True},
+        train_dataset=get_librispeech_lm_dataset(
+            vocab="spm10k", train_epoch_split=20, train_sort_laplace_num_seqs=100_000
+        ),
+        model_def=ModelDefWithCfg(
+            lm_model_def,
+            {
+                "_model_def_dict": rf.build_dict(
+                    TransformerDecoder,
+                    encoder_dim=None,
+                    num_layers=24,
+                    model_dim=512,
+                    ff_activation=rf.build_dict(rf.gelu),
+                    dropout=0.0,
+                    att_dropout=0.0,
+                )
+            },
+        ),
+        train_def=lm_train_def,
+        # avoid oom
+        env_updates={"PYTORCH_CUDA_ALLOC_CONF": "expandable_segments:True"},
+    )
+
     from returnn.util.math import PiecewiseLinear
 
     # Try warmup of batch size (warmupBs).
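
The key change in this experiment is `"use_normalized_loss": "seqs"` together with the comment "Normalize by num seqs, sum over frames": the per-batch loss is summed over frames but divided by the number of sequences rather than by the total number of frames. The sketch below is only a hypothetical illustration of that difference (the tensors, masking, and variable names are assumptions, not the actual RETURNN / `lm_train_def` code):

```python
import torch

# Hypothetical per-frame cross-entropy, shape [batch, time], and valid lengths per sequence.
ce = torch.tensor([[2.0, 1.0, 0.5, 0.0], [1.5, 0.5, 0.0, 0.0]])
seq_lens = torch.tensor([3, 2])

# Mask out padded frames so they do not contribute to the sums.
mask = torch.arange(ce.shape[1])[None, :] < seq_lens[:, None]
ce = ce * mask

# Frame-normalized loss: sum over all valid frames, divide by the total number of frames.
loss_frame_norm = ce.sum() / seq_lens.sum()

# Seq-normalized loss (sum over frames, divide by number of sequences),
# so longer sequences contribute proportionally more to the gradient.
loss_seq_norm = ce.sum() / seq_lens.numel()

print(float(loss_frame_norm), float(loss_seq_norm))  # 1.1 vs 2.75
```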