From 9d5d11384955cf6a5ffc8bd3de66e3461697f28d Mon Sep 17 00:00:00 2001
From: mori360
Date: Tue, 3 Dec 2024 21:42:06 -0800
Subject: [PATCH] change config name

---
 torchtitan/checkpoint.py     |  2 +-
 torchtitan/config_manager.py | 16 ++++++++--------
 torchtitan/optimizer.py      |  4 ++--
 3 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/torchtitan/checkpoint.py b/torchtitan/checkpoint.py
index 684e0175..689328b3 100644
--- a/torchtitan/checkpoint.py
+++ b/torchtitan/checkpoint.py
@@ -245,7 +245,7 @@ def __init__(
                 model_parts,
                 optimizers,
             )
-            if not job_config.training.enable_optimizer_in_backward
+            if not job_config.optimizer.backward
             else OptimizerInBackwardWrapper(
                 model_parts,
                 optimizers,
diff --git a/torchtitan/config_manager.py b/torchtitan/config_manager.py
index b588e781..175ae8cc 100644
--- a/torchtitan/config_manager.py
+++ b/torchtitan/config_manager.py
@@ -187,6 +187,14 @@ def __init__(self):
             action="store_true",
             help="Whether the fused implementation(CUDA only) is used.",
         )
+        self.parser.add_argument(
+            "--optimizer.backward",
+            type=bool,
+            default=False,
+            help="""
+            Whether to apply the optimizer in the backward pass. Caution: optimizer-in-backward
+            is not compatible with gradient clipping.""",
+        )
 
         # training configs
         self.parser.add_argument(
@@ -270,14 +278,6 @@ def __init__(self):
             action="store_true",
             help="Whether to apply loss parallel when sequence parallel is enabled",
         )
-        self.parser.add_argument(
-            "--training.enable_optimizer_in_backward",
-            type=bool,
-            default=False,
-            help="""
-            Whether to apply optimizer in the backward. Caution, optimizer_in_backward
-            cannot compile with gradients clipping.""",
-        )
         self.parser.add_argument(
             "--experimental.enable_async_tensor_parallel",
             default=False,
diff --git a/torchtitan/optimizer.py b/torchtitan/optimizer.py
index 3478b15a..02519de3 100644
--- a/torchtitan/optimizer.py
+++ b/torchtitan/optimizer.py
@@ -16,7 +16,7 @@ def build_optimizers(model_parts, job_config: JobConfig):
     """Wrap one optimizer per model part in an OptimizersContainer which provides a single
     step() and zero_grad() method for all the child optimizers.
     """
-    optim_in_bwd = job_config.training.enable_optimizer_in_backward
+    optim_in_bwd = job_config.optimizer.backward
 
     def _build_optimizer(model):
         name = job_config.optimizer.name
@@ -135,7 +135,7 @@ def linear_warmup_linear_decay(
 
 
 def build_lr_schedulers(optimizers, job_config: JobConfig):
-    optim_in_bwd = job_config.training.enable_optimizer_in_backward
+    optim_in_bwd = job_config.optimizer.backward
 
     def _build_lr_scheduler(optimizer):
         """Build a linear warmup and linear decay scheduler"""
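
For context on the option being renamed, the sketch below illustrates the general optimizer-in-backward technique that `--optimizer.backward` toggles, built on PyTorch's `torch.Tensor.register_post_accumulate_grad_hook`. This is only an illustration under stated assumptions, not torchtitan's `OptimizerInBackwardWrapper`: the model, learning rate, and hook name `step_in_backward` are made up for the example. Because each parameter's optimizer step runs (and its gradient can be freed) inside `backward()`, there is no later point at which all gradients coexist, which is why the new help text warns that the option is incompatible with gradient clipping.

```python
# Minimal sketch of optimizer-in-backward (not torchtitan's implementation).
import torch

model = torch.nn.Linear(8, 8)

# One optimizer per parameter, stepped as soon as that parameter's
# gradient has been accumulated during backward().
optimizers = {p: torch.optim.AdamW([p], lr=1e-3) for p in model.parameters()}

def step_in_backward(param: torch.Tensor) -> None:
    # Hypothetical hook name; receives the parameter whose grad just accumulated.
    optim = optimizers[param]
    optim.step()
    optim.zero_grad()  # free the gradient immediately

for p in model.parameters():
    p.register_post_accumulate_grad_hook(step_in_backward)

loss = model(torch.randn(4, 8)).sum()
loss.backward()  # optimizer steps happen here; no explicit step() afterwards
```

With this patch, the option would presumably be enabled via `--optimizer.backward` on the command line, or the corresponding key under the `[optimizer]` section of the job's TOML config, rather than under `[training]` as before.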