Commit
Merge branch 'main' into add-color-to-console-outputs
lessw2020 authored Feb 24, 2024
2 parents 565ef75 + 6e17001 commit 1198e76
Showing 16 changed files with 390 additions and 182 deletions.
12 changes: 5 additions & 7 deletions .github/workflows/lint.yaml
@@ -16,7 +16,7 @@ jobs:
runs-on: ubuntu-latest
strategy:
matrix:
python-version: ['3.11']
python-version: ['3.10']
steps:
- name: Check out repo
uses: actions/checkout@v3
@@ -30,10 +30,8 @@ jobs:
run: |
python -m pip install pre-commit
pre-commit install-hooks
- id: file_changes
uses: trilom/file-changes-action@v1.2.4
with:
prNumber: ${{ github.event.number }}
output: ' '
- name: Get changed files
id: changed-files
uses: tj-actions/changed-files@v41.0.0
- name: Lint modified files
run: pre-commit run --files ${{ steps.file_changes.outputs.files }}
run: pre-commit run --files ${{ steps.changed-files.outputs.all_changed_files }}
43 changes: 43 additions & 0 deletions .github/workflows/unit_test_4gpu.yaml
@@ -0,0 +1,43 @@
name: 4 GPU Unit Test

on:
push:
branches: [ main ]
pull_request:

concurrency:
group: unit-test${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_number || github.ref }}
cancel-in-progress: true

defaults:
run:
shell: bash -l -eo pipefail {0}

jobs:
unit_tests_4gpu:
runs-on: linux.g5.12xlarge.nvidia.gpu
strategy:
matrix:
python-version: ['3.10']
steps:
- name: Check out repo
uses: actions/checkout@v3
- name: Setup conda env
uses: conda-incubator/setup-miniconda@v2
with:
auto-update-conda: true
miniconda-version: "latest"
activate-environment: test
python-version: ${{ matrix.python-version }}
- name: Update pip
run: python -m pip install --upgrade pip
- name: Install dependencies
run: |
pip3 install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu121
python -m pip install -r requirements.txt
python -m pip install -r dev-requirements.txt
python -m pip install -e .
- name: Run NGPU=4 ./run_llama_train.sh
run: NGPU=4 ./run_llama_train.sh
- name: Upload Coverage to Codecov
uses: codecov/codecov-action@v3
@@ -1,4 +1,4 @@
name: Unit Test
name: CPU Unit Test

on:
push:
@@ -14,7 +14,7 @@ defaults:
shell: bash -l -eo pipefail {0}

jobs:
unit_tests:
cpu_unit_tests:
runs-on: ubuntu-latest
strategy:
matrix:
8 changes: 3 additions & 5 deletions run_llama_train.sh
@@ -23,10 +23,8 @@ CHECKPOINT_FOLDER=${CHECKPOINT_FOLDER:-""}
# Please adjust this to a longer interval period. The unit of measurement is in steps.
CHECKPOINT_INTERVAL=${CHECKPOINT_INTERVAL:-5}

CONFIG_FILE=${CONFIG_FILE:-"./torchtrain/train_configs/train_config.toml"}

torchrun --nproc_per_node=${NGPU} --rdzv_endpoint="localhost:5972" \
--local-ranks-filter ${LOG_RANK} --role rank --tee 3 \
train.py --steps 10 \
--model ${MODEL} --model_conf ${MODEL_CONF} \
--pp_degree ${PP} --sp_degree ${SP} --dp_degree ${DP} \
--compile \
--checkpoint-folder=${CHECKPOINT_FOLDER} --checkpoint-interval=${CHECKPOINT_INTERVAL}
train.py --job.config_file ${CONFIG_FILE}
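The run script now defers model and training options to a TOML file passed via --job.config_file instead of spelling out individual flags. Below is a purely illustrative sketch of how such a file is consumed; the values and the temporary-file setup are assumptions for demonstration, since the repository's actual ./torchtrain/train_configs/train_config.toml is not shown in this diff.

```python
# Illustrative only: write a minimal TOML config and load it through JobConfig,
# the same path train.py now takes via --job.config_file.
import tempfile

from torchtrain.config_manager import JobConfig

toml_text = """
[model]
name = "llama"
flavor = "debugmodel"
tokenizer_path = "./torchtrain/datasets/tokenizer/tokenizer.model"

[training]
steps = 10
"""

with tempfile.NamedTemporaryFile("w", suffix=".toml", delete=False) as f:
    f.write(toml_text)

config = JobConfig()
config.parse_args(["--job.config_file", f.name])
assert config.model.name == "llama"
assert config.training.steps == 10
```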
Empty file added test/__init__.py
Empty file.
21 changes: 21 additions & 0 deletions test/test_job_config.py
@@ -0,0 +1,21 @@
import pytest
from torchtrain.config_manager import JobConfig


class TestJobConfig:
def test_command_line_args(self):
config = JobConfig()
config.parse_args([])
assert config.model.name == "llama"

def test_job_config_file(self):
config = JobConfig()
config.parse_args(
["--job.config_file", "./torchtrain/train_configs/train_config.toml"]
)
assert config.model.name == "llama"

def test_job_file_does_not_exist(self):
with pytest.raises(FileNotFoundError):
config = JobConfig()
config.parse_args(["--job.config_file", "ohno.toml"])
1 change: 1 addition & 0 deletions test/test_test.py
@@ -1,6 +1,7 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# This software may be used and distributed according to the terms of the Llama 2 Community License Agreement.


# delete me after adding real tests..
class Test:
def test_test(self):
215 changes: 215 additions & 0 deletions torchtrain/config_manager.py
@@ -0,0 +1,215 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
import argparse
import sys
from collections import defaultdict
from typing import Union

try:
import tomllib
except ModuleNotFoundError:
import tomli as tomllib


class JobConfig:
"""
A helper class to manage the train configuration.
Semantics:
- Default config is loaded from a toml file. If no toml file is provided,
then the default config is loaded from argparse defaults.
"""

def parse_args(self, args_list: list = sys.argv[1:]):
args = JobConfig.init_args_from_command_line(args_list)
config_file = getattr(args, "job.config_file", None)
if config_file is None:
args_dict = self._args_to_two_level_dict(args)
else:
with open(config_file, "rb") as f:
args_dict = tomllib.load(f)
for k, v in args_dict.items():
class_type = type(k.title(), (), v)
setattr(self, k, class_type())
self._validate_config()

def _args_to_two_level_dict(self, args: argparse.Namespace) -> defaultdict:
args_dict = defaultdict(defaultdict)
for k, v in vars(args).items():
first_level_key, second_level_key = k.split(".", 1)
args_dict[first_level_key][second_level_key] = v
return args_dict

def _validate_config(self):
# TODO: Add more mandatory validations
assert self.model.name and self.model.flavor and self.model.tokenizer_path
return True

@staticmethod
def init_args_from_command_line(
args_list: list = sys.argv[1:],
) -> argparse.Namespace:
"""
Each argument is named <section>.<option>, where <section> is a section in the toml file
and <option> is the name of the option within that section. For example,
model.name translates to:
[model]
name
in the toml file
"""
parser = argparse.ArgumentParser(description="TorchTrain arg parser.")
parser.add_argument(
"--job.config_file",
type=str,
default=None,
help="job config file",
)

# misc configs
parser.add_argument(
"--job.dump_folder",
type=str,
default="./torchtrain/outputs",
help="folder to dump job outputs",
)

# profiling configs
parser.add_argument(
"--profiling.run_profiler",
action="store_true",
help="enable pytorch profiler",
)
parser.add_argument(
"--profiling.save_traces_folder",
type=str,
default="profiling/traces",
help="trace file location",
)
parser.add_argument(
"--profiling.profile_every_x_iter",
type=int,
default=10,
help="collect profiler traces every x iterations",
)
# metrics configs
parser.add_argument(
"--metrics.log_freq",
type=int,
default=10,
help="how often to log metrics to TensorBoard",
)
parser.add_argument(
"--metrics.enable_tensorboard",
action="store_true",
help="how often to log metrics to TensorBoard",
)
parser.add_argument(
"--metrics.save_tb_folder",
type=str,
default="tb",
help="folder to dump tensorboard state",
)

# model configs
parser.add_argument(
"--model.name",
type=str,
default="llama",
help="which model to train",
)
parser.add_argument(
"--model.flavor",
type=str,
default="debugmodel",
help="which model config to train",
)
parser.add_argument(
"--model.tokenizer_path",
type=str,
default="./torchtrain/datasets/tokenizer/tokenizer.model",
help="tokenizer path",
)

# optimizer configs
parser.add_argument(
"--optimizer.name", type=str, default="AdamW", help="optimizer to use"
)
parser.add_argument(
"--optimizer.lr", type=float, default=8e-4, help="learning rate to use"
)

# training configs
parser.add_argument(
"--training.dataset", type=str, default="alpaca", help="dataset to use"
)
parser.add_argument(
"--training.batch_size", type=int, default=8, help="batch size"
)
parser.add_argument(
"--training.seq_len", type=int, default=2048, help="sequence length"
)
parser.add_argument(
"--training.warmup_pct",
type=float,
default=0.20,
help="percentage of total training steps to use for warmup",
)
parser.add_argument(
"--training.max_norm",
type=Union[float, int],
default=1.0,
help="max norm for gradient clipping",
)
parser.add_argument(
"--training.steps", type=int, default=-1, help="how many train steps to run"
)
parser.add_argument(
"--training.data_parallel_degree",
type=int,
default=-1,
help="Data Parallelism degree. -1 means leftover ranks will be used (After SP/PP). 1 means disabled.",
)
parser.add_argument(
"--training.sequence_parallel_degree",
type=int,
default=1,
help="Sequence Parallelism degree. 1 means disabled.",
)
parser.add_argument(
"--training.pipeline_parallel_degree",
type=int,
default=1,
help="Pipeline Parallelism degree (default of 1 means disabled)",
)
parser.add_argument(
"--training.compile",
action="store_true",
help="Whether to compile the model.",
)
parser.add_argument(
"--training.checkpoint_interval",
type=int,
default=3600,
help=(
"Checkpointing interval. The unit of measurement is in seconds or "
"steps depending on --training.checkpoint-internval-type."
),
)
parser.add_argument(
"--training.checkpoint_interval_type",
type=str,
default="steps",
help=(
"The checkpointing interval unit of measurement."
"The default value is step."
),
)
parser.add_argument(
"--training.checkpoint_folder",
type=str,
default="",
help=(
"The folder to store the checkpoints. If this is not specified or "
"is an empty string, checkpointing is disabled."
),
)
return parser.parse_args(args_list)
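For orientation, here is a short usage sketch of the class above. This is assumed wiring, not lifted from train.py: defaults come from argparse, and any section.option flag overrides them when no TOML file is supplied.

```python
# Usage sketch (assumed): parse overrides from an explicit list instead of
# sys.argv, then read values off the generated per-section attribute objects.
from torchtrain.config_manager import JobConfig

config = JobConfig()
config.parse_args(["--training.steps", "10", "--training.compile"])

print(config.model.name)        # "llama" (argparse default)
print(config.training.steps)    # 10 (overridden)
print(config.training.compile)  # True (store_true flag)
```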
3 changes: 2 additions & 1 deletion torchtrain/datasets/tokenizer.py
@@ -8,8 +8,8 @@

import os
from abc import ABC, abstractmethod
from typing import List
from logging import getLogger
from typing import List

from sentencepiece import SentencePieceProcessor

@@ -48,6 +48,7 @@ def create_tokenizer(tokenizer_type: str, tokenizer_path: str) -> TokenizerIf:

class SentencePieceTokenizer(TokenizerIf):
"""tokenizing and encoding/decoding text using SentencePiece."""

def __init__(self, tokenizer_path: str):
"""
Initializes the Tokenizer with a SentencePiece model.
3 changes: 2 additions & 1 deletion torchtrain/logging_utils.py
@@ -1,6 +1,7 @@
import torch
import logging

import torch

logger = logging.getLogger()


9 changes: 6 additions & 3 deletions torchtrain/lr_scheduling.py
@@ -2,6 +2,7 @@
# All rights reserved.

from torch.optim.lr_scheduler import LambdaLR
from torchtrain.config_manager import JobConfig

# global states for scheduling
# these are needed as LambdaLR does not support argument passing
@@ -29,11 +30,13 @@ def linear_warmup_linear_decay(current_step: int) -> float:
return curr_adjustment


def get_lr_scheduler(optimizer, args):
def get_lr_scheduler(optimizer, job_config: JobConfig):
"""Build a linear warmup and linear decay scheduler"""
global _warmup_steps, _decay_steps
_warmup_steps = max(int(args.steps * args.warmup_pct), 2)
_decay_steps = float(max(1, args.steps - _warmup_steps))
_warmup_steps = max(
int(job_config.training.steps * job_config.training.warmup_pct), 2
)
_decay_steps = float(max(1, job_config.training.steps - _warmup_steps))

warmup_scheduler = LambdaLR(optimizer, lr_lambda=linear_warmup_linear_decay)
return warmup_scheduler
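The scheduler now reads its warmup/decay horizon from the JobConfig rather than raw args. A sketch of how it might be driven in a training loop follows; the loop wiring is an assumption for illustration, not taken from train.py.

```python
# Assumed training-loop wiring (illustrative): the LR multiplier ramps up
# linearly for max(int(steps * warmup_pct), 2) steps, then decays linearly.
import torch

from torchtrain.config_manager import JobConfig
from torchtrain.lr_scheduling import get_lr_scheduler

config = JobConfig()
config.parse_args(["--training.steps", "100"])

model = torch.nn.Linear(8, 8)
optimizer = torch.optim.AdamW(model.parameters(), lr=config.optimizer.lr)
scheduler = get_lr_scheduler(optimizer, config)

for _ in range(config.training.steps):
    optimizer.step()    # forward/backward omitted for brevity
    scheduler.step()    # advance the linear warmup / linear decay schedule
```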