[Minor] Enable continuation of training #1605

Open
Wants to merge 14 commits into base: main
62 changes: 50 additions & 12 deletions neuralprophet/configure.py
@@ -94,7 +94,7 @@
optimizer: Union[str, Type[torch.optim.Optimizer]]
quantiles: List[float] = field(default_factory=list)
optimizer_args: dict = field(default_factory=dict)
scheduler: Optional[Type[torch.optim.lr_scheduler.OneCycleLR]] = None
scheduler: Optional[Type[torch.optim.lr_scheduler._LRScheduler]] = None
scheduler_args: dict = field(default_factory=dict)
newer_samples_weight: float = 1.0
newer_samples_start: float = 0.0
@@ -104,6 +104,7 @@
n_data: int = field(init=False)
loss_func_name: str = field(init=False)
lr_finder_args: dict = field(default_factory=dict)
optimizer_state: dict = field(default_factory=dict)

def __post_init__(self):
# assert the uncertainty estimation params and then finalize the quantiles
@@ -189,19 +190,53 @@

def set_scheduler(self):
"""
Set the scheduler and scheduler args.
Set the scheduler and scheduler args depending on the user selection.
The scheduler is not initialized yet as this is done in configure_optimizers in TimeNet.
"""
self.scheduler = torch.optim.lr_scheduler.OneCycleLR
self.scheduler_args.update(
{
"pct_start": 0.3,
"anneal_strategy": "cos",
"div_factor": 10.0,
"final_div_factor": 10.0,
"three_phase": True,
}
)
self.scheduler_args.clear()
if isinstance(self.scheduler, str):
if self.scheduler.lower() == "onecyclelr":
self.scheduler = torch.optim.lr_scheduler.OneCycleLR

[CI: pyright failure, configure.py:199] Cannot assign to attribute "scheduler" for class "Train*": "type[OneCycleLR]" is incompatible with "type[_LRScheduler] | None" (reportAttributeAccessIssue)
self.scheduler_args.update(
{
"pct_start": 0.3,
"anneal_strategy": "cos",
"div_factor": 10.0,
"final_div_factor": 10.0,
"three_phase": True,
}
)
elif self.scheduler.lower() == "steplr":
self.scheduler = torch.optim.lr_scheduler.StepLR

[CI: pyright failure, configure.py:210] Cannot assign to attribute "scheduler" for class "Train*": "type[StepLR]" is incompatible with "type[_LRScheduler] | None" (reportAttributeAccessIssue)
self.scheduler_args.update(
{
"step_size": 10,
"gamma": 0.1,
}
)
elif self.scheduler.lower() == "exponentiallr":
self.scheduler = torch.optim.lr_scheduler.ExponentialLR

[CI: pyright failure, configure.py:218] Cannot assign to attribute "scheduler" for class "Train*": "type[ExponentialLR]" is incompatible with "type[_LRScheduler] | None" (reportAttributeAccessIssue)
self.scheduler_args.update(

[CI: codecov warning, configure.py:217-219] Added lines L217-L219 were not covered by tests.
{
"gamma": 0.95,
}
)
elif self.scheduler.lower() == "cosineannealinglr":
self.scheduler = torch.optim.lr_scheduler.CosineAnnealingLR

[CI: pyright failure, configure.py:225] Cannot assign to attribute "scheduler" for class "Train*": "type[CosineAnnealingLR]" is incompatible with "type[_LRScheduler] | None" (reportAttributeAccessIssue)
self.scheduler_args.update(

[CI: codecov warning, configure.py:224-226] Added lines L224-L226 were not covered by tests.
{
"T_max": 50,
}
)
else:
raise NotImplementedError(f"Scheduler {self.scheduler} is not supported.")

[CI: codecov warning, configure.py:232] Added line L232 was not covered by tests.
elif self.scheduler is None:
self.scheduler = torch.optim.lr_scheduler.ExponentialLR

[CI: pyright failure, configure.py:234] Cannot assign to attribute "scheduler" for class "Train*": "type[ExponentialLR]" is incompatible with "type[_LRScheduler] | None" (reportAttributeAccessIssue)
self.scheduler_args.update(
{
"gamma": 0.95,
}
)

def set_lr_finder_args(self, dataset_size, num_batches):
"""
@@ -239,6 +274,9 @@
delay_weight = 1
return delay_weight

def set_optimizer_state(self, optimizer_state: dict):
self.optimizer_state = optimizer_state


@dataclass
class Trend:
@@ -304,7 +342,7 @@
log.error("Invalid growth for global_local mode '{}'. Set to 'global'".format(self.trend_global_local))
self.trend_global_local = "global"

if self.trend_local_reg < 0:

[CI: pyright failure, configure.py:345] Operator "<" not supported for "None" (reportOptionalOperand)
log.error("Invalid negative trend_local_reg '{}'. Set to False".format(self.trend_local_reg))
self.trend_local_reg = False

@@ -353,13 +391,13 @@
log.error("Invalid global_local mode '{}'. Set to 'global'".format(self.global_local))
self.global_local = "global"

self.periods = OrderedDict(

[CI: pyright failure, configure.py:394] No overloads for "__init__" match the provided arguments (reportCallIssue)
{

[CI: pyright failure, configure.py:395] Argument of type "dict[str, Season]" cannot be assigned to parameter "iterable" of type "Iterable[list[bytes]]" in function "__init__" (reportArgumentType)
"yearly": Season(
resolution=6,
period=365.25,
arg=self.yearly_arg,
global_local=(

[CI: pyright failure, configure.py:400] Argument of type "SeasonGlobalLocalMode | Literal['auto']" cannot be assigned to parameter "global_local" of type "SeasonGlobalLocalMode" in function "__init__" (reportArgumentType)
self.yearly_global_local
if self.yearly_global_local in ["global", "local"]
else self.global_local
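For readers skimming the hunk above, the dispatch in `set_scheduler` boils down to a name-to-class lookup with per-scheduler default arguments. Below is a minimal standalone sketch of that mapping (not the PR's code); the default values mirror the diff above, use only torch's public `lr_scheduler` classes, and should be treated as illustrative:

```python
import torch

# Name -> (scheduler class, default kwargs), mirroring Train.set_scheduler above.
_SCHEDULERS = {
    "onecyclelr": (
        torch.optim.lr_scheduler.OneCycleLR,
        {"pct_start": 0.3, "anneal_strategy": "cos", "div_factor": 10.0,
         "final_div_factor": 10.0, "three_phase": True},
    ),
    "steplr": (torch.optim.lr_scheduler.StepLR, {"step_size": 10, "gamma": 0.1}),
    "exponentiallr": (torch.optim.lr_scheduler.ExponentialLR, {"gamma": 0.95}),
    "cosineannealinglr": (torch.optim.lr_scheduler.CosineAnnealingLR, {"T_max": 50}),
}


def resolve_scheduler(name=None):
    """Return (scheduler_class, default_kwargs); None falls back to ExponentialLR."""
    if name is None:
        return _SCHEDULERS["exponentiallr"]
    try:
        return _SCHEDULERS[name.lower()]
    except KeyError:
        raise NotImplementedError(f"Scheduler {name} is not supported.")


# Quick sanity check with a throwaway optimizer.
opt = torch.optim.AdamW([torch.nn.Parameter(torch.zeros(1))], lr=1e-3)
cls, kwargs = resolve_scheduler("StepLR")
sched = cls(opt, **kwargs)
opt.step()
sched.step()
print(type(sched).__name__, sched.get_last_lr())
```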
112 changes: 88 additions & 24 deletions neuralprophet/forecaster.py
@@ -298,6 +298,20 @@
>>> m = NeuralProphet(collect_metrics=["MSE", "MAE", "RMSE"])
>>> # use custom torchmetrics names
>>> m = NeuralProphet(collect_metrics={"MAPE": "MeanAbsolutePercentageError", "MSLE": "MeanSquaredLogError",
scheduler : str, torch.optim.lr_scheduler._LRScheduler
Type of learning rate scheduler to use.

Options
* (default) ``OneCycleLR``: One Cycle Learning Rate scheduler
* ``StepLR``: Step Learning Rate scheduler
* ``ExponentialLR``: Exponential Learning Rate scheduler
* ``CosineAnnealingLR``: Cosine Annealing Learning Rate scheduler

Examples
--------
>>> from neuralprophet import NeuralProphet
>>> # Step Learning Rate scheduler
>>> m = NeuralProphet(scheduler="StepLR")

COMMENT
Uncertainty Estimation
@@ -432,6 +446,7 @@
batch_size: Optional[int] = None,
loss_func: Union[str, torch.nn.modules.loss._Loss, Callable] = "SmoothL1Loss",
optimizer: Union[str, Type[torch.optim.Optimizer]] = "AdamW",
scheduler: Optional[str] = "onecyclelr",
newer_samples_weight: float = 2,
newer_samples_start: float = 0.0,
quantiles: List[float] = [],
@@ -505,6 +520,7 @@
self.config_train = configure.Train(
quantiles=quantiles,
learning_rate=learning_rate,
scheduler=scheduler,
epochs=epochs,
batch_size=batch_size,
loss_func=loss_func,
@@ -916,6 +932,7 @@
continue_training: bool = False,
num_workers: int = 0,
deterministic: bool = False,
scheduler: Optional[str] = None,
):
"""Train, and potentially evaluate model.

@@ -967,14 +984,38 @@
Note: using multiple workers and therefore distributed training might significantly increase
the training time since each batch needs to be copied to each worker for each epoch. Keeping
all data on the main process might be faster for most datasets.
scheduler : str
Type of learning rate scheduler to use for continued training. If None, ExponentialLR is
used as the default, as specified in the model config.
Options
* ``StepLR``: Step Learning Rate scheduler
* ``ExponentialLR``: Exponential Learning Rate scheduler
* ``CosineAnnealingLR``: Cosine Annealing Learning Rate scheduler

Returns
-------
pd.DataFrame
metrics with training and potentially evaluation metrics
"""
if self.fitted:
raise RuntimeError("Model has been fitted already. Please initialize a new model to fit again.")
if self.fitted and not continue_training:
raise RuntimeError(
"Model has been fitted already. If you want to continue training please set the flag continue_training."
)

if continue_training and epochs is None:
raise ValueError("Continued training requires setting the number of epochs to train for.")

[CI: codecov warning, forecaster.py:1006] Added line L1006 was not covered by tests.

if continue_training:
if scheduler is not None:
self.config_train.scheduler = scheduler
else:
self.config_train.scheduler = None
self.config_train.set_scheduler()

if scheduler is not None and not continue_training:
log.warning(

[CI: codecov warning, forecaster.py:1016] Added line L1016 was not covered by tests.
"Scheduler can only be set in fit when continuing training. Please set the scheduler when initializing the model."
)

# Configuration
if epochs is not None:
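To make the intended call pattern concrete, here is a hedged usage sketch of the `continue_training` and `scheduler` arguments documented in the hunk above. The `checkpointing=True` flag and the toy data are assumptions layered on the existing fit() API, not something this diff confirms:

```python
import pandas as pd
from neuralprophet import NeuralProphet

# Toy dataset; any well-formed (ds, y) frame works.
df = pd.DataFrame({
    "ds": pd.date_range("2024-01-01", periods=200, freq="D"),
    "y": range(200),
})

m = NeuralProphet(epochs=10)
# checkpointing=True is assumed to be what makes metrics_logger.checkpoint_path available.
metrics_first = m.fit(df, freq="D", checkpointing=True)

# Resume from the last checkpoint for 5 more epochs with a StepLR schedule.
metrics_more = m.fit(df, freq="D", epochs=5, continue_training=True, scheduler="StepLR")
```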
@@ -1060,8 +1101,9 @@
or any(value != 1 for value in self.num_seasonalities_modelled_dict.values())
)

if self.fitted is True and not continue_training:
log.error("Model has already been fitted. Re-fitting may break or produce different results.")
if continue_training and self.metrics_logger.checkpoint_path is None:
log.error("Continued training requires checkpointing in model to continue from last epoch.")

[CI: codecov warning, forecaster.py:1105] Added line L1105 was not covered by tests.

self.max_lags = df_utils.get_max_num_lags(
n_lags=self.n_lags, config_lagged_regressors=self.config_lagged_regressors
)
@@ -2661,23 +2703,23 @@
torch DataLoader
"""
df, _, _, _ = df_utils.prep_or_copy_df(df) # TODO: Can this call be avoided?
# if not self.fitted:
self.config_normalization.init_data_params(
df=df,
config_lagged_regressors=self.config_lagged_regressors,
config_regressors=self.config_regressors,
config_events=self.config_events,
config_seasonality=self.config_seasonality,
)
if not self.fitted:
self.config_normalization.init_data_params(
df=df,
config_lagged_regressors=self.config_lagged_regressors,
config_regressors=self.config_regressors,
config_events=self.config_events,
config_seasonality=self.config_seasonality,
)

df = _normalize(df=df, config_normalization=self.config_normalization)
# if not self.fitted:
if self.config_trend.changepoints is not None:
# scale user-specified changepoint times
df_aux = pd.DataFrame({"ds": pd.Series(self.config_trend.changepoints)})
if not self.fitted:
if self.config_trend.changepoints is not None:
# scale user-specified changepoint times
df_aux = pd.DataFrame({"ds": pd.Series(self.config_trend.changepoints)})

df_normalized = _normalize(df=df_aux, config_normalization=self.config_normalization)
self.config_trend.changepoints = df_normalized["t"].values # type: ignore
df_normalized = _normalize(df=df_aux, config_normalization=self.config_normalization)
self.config_trend.changepoints = df_normalized["t"].values # type: ignore

# df_merged, _ = df_utils.join_dataframes(df)
# df_merged = df_merged.sort_values("ds")
@@ -2765,12 +2807,24 @@
# Internal flag to check if validation is enabled
validation_enabled = df_val is not None

# Init the model, if not continue from checkpoint
# Load model and optimizer state from checkpoint if continue_training is True
if continue_training:
raise NotImplementedError(
"Continuing training from checkpoint is not implemented yet. This feature is planned for one of the \
upcoming releases."
)
checkpoint_path = self.metrics_logger.checkpoint_path
checkpoint = torch.load(checkpoint_path)

checkpoint_epoch = checkpoint["epoch"] if "epoch" in checkpoint else 0
previous_epoch = max(self.model.current_epoch, checkpoint_epoch)

# Set continue_training flag in model to update scheduler correctly
self.model.continue_training = True
self.model.start_epoch = previous_epoch

# Adjust epochs
new_total_epochs = previous_epoch + self.config_train.epochs
self.config_train.epochs = new_total_epochs

self.config_train.set_optimizer_state(checkpoint["optimizer_states"][0])

else:
self.model = self._init_model()
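The bookkeeping in the continue_training branch above is easier to see in isolation. The sketch below builds an in-memory stand-in for a PyTorch Lightning checkpoint (real ones are `torch.load`-ed `.ckpt` files exposing the same top-level keys, "epoch" and "optimizer_states") and reproduces the epoch and optimizer-state arithmetic; all numbers are illustrative:

```python
import torch

# In-memory stand-in for a Lightning checkpoint dict.
opt = torch.optim.AdamW([torch.nn.Parameter(torch.zeros(1))], lr=1e-3)
checkpoint = {"epoch": 42, "optimizer_states": [opt.state_dict()]}

model_current_epoch = 0                       # stand-in for self.model.current_epoch
checkpoint_epoch = checkpoint.get("epoch", 0)
previous_epoch = max(model_current_epoch, checkpoint_epoch)

requested_epochs = 10                         # epochs passed to the resumed fit()
new_total_epochs = previous_epoch + requested_epochs

optimizer_state = checkpoint["optimizer_states"][0]
print(previous_epoch, new_total_epochs, sorted(optimizer_state.keys()))
```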

@@ -2852,8 +2906,18 @@

if not metrics_enabled:
return None

# Return metrics collected in logger as dataframe
metrics_df = pd.DataFrame(self.metrics_logger.history)
if self.metrics_logger.history is not None:
# avoid array mismatch when continuing training
history = self.metrics_logger.history
max_length = max(len(lst) for lst in history.values())
for key in history:
while len(history[key]) < max_length:
history[key].append(None)
metrics_df = pd.DataFrame(history)
else:
metrics_df = pd.DataFrame()

[CI: codecov warning, forecaster.py:2920] Added line L2920 was not covered by tests.
return metrics_df

def restore_trainer(self, accelerator: Optional[str] = None):
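The padding added to the metrics return path above guards against ragged histories when a resumed fit() logs more (or fewer) entries for some keys than others. A small hedged reproduction with a hypothetical logger history:

```python
import pandas as pd

# Hypothetical logger history after an initial fit plus a resumed fit.
history = {
    "MAE": [0.9, 0.7, 0.5, 0.4],
    "RMSE_val": [0.8, 0.6],
}

# Pad shorter series with None so every column has the same length.
max_length = max(len(values) for values in history.values())
padded = {key: values + [None] * (max_length - len(values)) for key, values in history.items()}
metrics_df = pd.DataFrame(padded)
print(metrics_df)
```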
40 changes: 34 additions & 6 deletions neuralprophet/time_net.py
@@ -63,6 +63,8 @@
num_seasonalities_modelled: int = 1,
num_seasonalities_modelled_dict: dict = None,
meta_used_in_model: bool = False,
continue_training: bool = False,
start_epoch: int = 0,
):
"""
Parameters
@@ -312,6 +314,10 @@
else:
self.config_regressors.regressors = None

# Continued training
self.continue_training = continue_training
self.start_epoch = start_epoch

@property
def ar_weights(self) -> torch.Tensor:
"""sets property auto-regression weights for regularization. Update if AR is modelled differently"""
@@ -865,12 +871,34 @@
optimizer = self._optimizer(self.parameters(), lr=self.learning_rate, **self.config_train.optimizer_args)

# Scheduler
lr_scheduler = self._scheduler(
optimizer,
max_lr=self.learning_rate,
total_steps=self.trainer.estimated_stepping_batches,
**self.config_train.scheduler_args,
)
self._scheduler = self.config_train.scheduler

if self.continue_training:
optimizer.load_state_dict(self.config_train.optimizer_state)

# Update initial learning rate to the last learning rate for continued training
last_lr = float(optimizer.param_groups[0]["lr"]) # Ensure it's a float

for param_group in optimizer.param_groups:
param_group["initial_lr"] = (last_lr,)

if self._scheduler == torch.optim.lr_scheduler.OneCycleLR:
log.warning("OneCycleLR scheduler is not supported for continued training. Switching to ExponentialLR")
self._scheduler = torch.optim.lr_scheduler.ExponentialLR
self.config_train.scheduler_args = {"gamma": 0.95}

[CI: codecov warning, time_net.py:886-888] Added lines L886-L888 were not covered by tests.

if self._scheduler == torch.optim.lr_scheduler.OneCycleLR:
lr_scheduler = self._scheduler(
optimizer,
max_lr=self.learning_rate,
total_steps=self.trainer.estimated_stepping_batches,
**self.config_train.scheduler_args,
)
else:
lr_scheduler = self._scheduler(
optimizer,
**self.config_train.scheduler_args,
)

return {"optimizer": optimizer, "lr_scheduler": lr_scheduler}
